From 973a2dd1d50a11d380086601f14e59116f93e8c5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:32 +0100 Subject: x86, mce: disable machine checks on suspend Impact: Bug fix During suspend it is not reliable to process machine check exceptions, because CPUs disappear but can still get machine check broadcasts. Also the system is slightly more likely to machine check them, but the handler is typically not a position to handle them in a meaningfull way. So disable them during suspend and enable them during resume. Also make sure they are always disabled on hot-unplugged CPUs. This new code assumes that suspend always hotunplugs all non BP CPUs. v2: Remove the WARN_ONs Thomas objected to. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 25cf624eccb..5ed80991ab9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -728,6 +728,29 @@ __setup("mce=", mcheck_enable); * Sysfs support */ +/* + * Disable machine checks on suspend and shutdown. We can't really handle + * them later. + */ +static int mce_disable(void) +{ + int i; + + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + return 0; +} + +static int mce_suspend(struct sys_device *dev, pm_message_t state) +{ + return mce_disable(); +} + +static int mce_shutdown(struct sys_device *dev) +{ + return mce_disable(); +} + /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. Only one CPU is active at this time, the others get readded later using CPU hotplug. */ @@ -752,6 +775,8 @@ static void mce_restart(void) } static struct sysdev_class mce_sysclass = { + .suspend = mce_suspend, + .shutdown = mce_shutdown, .resume = mce_resume, .name = "machinecheck", }; -- cgit v1.2.3 From 123aa76ec0cab5d4881cd8509faed43231e68801 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:27 +0100 Subject: x86, mce: don't disable machine checks during code patching Impact: low priority bug fix This removes part of a a patch I added myself some time ago. After some consideration the patch was a bad idea. In particular it stopped machine check exceptions during code patching. To quote the comment: * MCEs only happen when something got corrupted and in this * case we must do something about the corruption. * Ignoring it is worse than a unlikely patching race. * Also machine checks tend to be broadcast and if one CPU * goes into machine check the others follow quickly, so we don't * expect a machine check to cause undue problems during to code * patching. So undo the machine check related parts of 8f4e956b313dcccbc7be6f10808952345e3b638c NMIs are still disabled. This only removes code, the only additions are a new comment. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/alternative.c | 17 +++++++++++------ arch/x86/kernel/cpu/mcheck/mce_32.c | 14 -------------- arch/x86/kernel/cpu/mcheck/mce_64.c | 14 -------------- 3 files changed, 11 insertions(+), 34 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a84ac7b570e..5b8394a3a6b 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -414,9 +414,17 @@ void __init alternative_instructions(void) that might execute the to be patched code. Other CPUs are not running. */ stop_nmi(); -#ifdef CONFIG_X86_MCE - stop_mce(); -#endif + + /* + * Don't stop machine check exceptions while patching. + * MCEs only happen when something got corrupted and in this + * case we must do something about the corruption. + * Ignoring it is worse than a unlikely patching race. + * Also machine checks tend to be broadcast and if one CPU + * goes into machine check the others follow quickly, so we don't + * expect a machine check to cause undue problems during to code + * patching. + */ apply_alternatives(__alt_instructions, __alt_instructions_end); @@ -456,9 +464,6 @@ void __init alternative_instructions(void) (unsigned long)__smp_locks_end); restart_nmi(); -#ifdef CONFIG_X86_MCE - restart_mce(); -#endif } /** diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c index dfaebce3633..3552119b091 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/arch/x86/kernel/cpu/mcheck/mce_32.c @@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c) } } -static unsigned long old_cr4 __initdata; - -void __init stop_mce(void) -{ - old_cr4 = read_cr4(); - clear_in_cr4(X86_CR4_MCE); -} - -void __init restart_mce(void) -{ - if (old_cr4 & X86_CR4_MCE) - set_in_cr4(X86_CR4_MCE); -} - static int __init mcheck_disable(char *str) { mce_disabled = 1; diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 5ed80991ab9..25ccdbec86e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -680,20 +680,6 @@ static struct miscdevice mce_log_device = { &mce_chrdev_ops, }; -static unsigned long old_cr4 __initdata; - -void __init stop_mce(void) -{ - old_cr4 = read_cr4(); - clear_in_cr4(X86_CR4_MCE); -} - -void __init restart_mce(void) -{ - if (old_cr4 & X86_CR4_MCE) - set_in_cr4(X86_CR4_MCE); -} - /* * Old style boot options parsing. Only for compatibility. */ -- cgit v1.2.3 From 9bd984058088d6ef7af6946591a207e51a2f4890 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:28 +0100 Subject: x86, mce: always use separate work queue to run trigger Impact: Needed for bug fix in next patch This relaxes the requirement that mce_notify_user has to run in process context. Useful for future changes, but also leads to cleaner behaviour now. Now instead mce_notify_user can be called directly from interrupt (but not NMI) context. The work queue only uses a single global work struct, which can be done safely because it is always free to reuse before the trigger function is executed. This way no events can be lost. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 25ccdbec86e..18b379cf061 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -380,11 +380,17 @@ static void mcheck_timer(struct work_struct *work) schedule_delayed_work(&mcheck_work, next_interval); } +static void mce_do_trigger(struct work_struct *work) +{ + call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); +} + +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); + /* - * This is only called from process context. This is where we do - * anything we need to alert userspace about new MCEs. This is called - * directly from the poller and also from entry.S and idle, thanks to - * TIF_MCE_NOTIFY. + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context. */ int mce_notify_user(void) { @@ -394,9 +400,14 @@ int mce_notify_user(void) unsigned long now = jiffies; wake_up_interruptible(&mce_wait); - if (trigger[0]) - call_usermodehelper(trigger, trigger_argv, NULL, - UMH_NO_WAIT); + + /* + * There is no risk of missing notifications because + * work_pending is always cleared before the function is + * executed. + */ + if (trigger[0] && !work_pending(&mce_trigger_work)) + schedule_work(&mce_trigger_work); if (time_after_eq(now, last_print + (check_interval*HZ))) { last_print = now; -- cgit v1.2.3 From 52d168e28bc11dd026b620fe1767cadde5a747cd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:29 +0100 Subject: x86, mce: switch machine check polling to per CPU timer Impact: Higher priority bug fix The machine check poller runs a single timer and then broadcasted an IPI to all CPUs to check them. This leads to unnecessary synchronization between CPUs. The original CPU running the timer has to wait potentially a long time for all other CPUs answering. This is also real time unfriendly and in general inefficient. This was especially a problem on systems with a lot of events where the poller run with a higher frequency after processing some events. There could be more and more CPU time wasted with this, to the point of significantly slowing down machines. The machine check polling is actually fully independent per CPU, so there's no reason to not just do this all with per CPU timers. This patch implements that. Also switch the poller also to use standard timers instead of work queues. It was using work queues to be able to execute a user program on a event, but mce_notify_user() handles this case now with a separate callback. So instead always run the poll code in in a standard per CPU timer, which means that in the common case of not having to execute a trigger there will be less overhead. This allows to clean up the initialization significantly, because standard timers are already up when machine checks get init'ed. No multiple initialization functions. Thanks to Thomas Gleixner for some help. Cc: thockin@google.com v2: Use del_timer_sync() on cpu shutdown and don't try to handle migrated timers. v3: Add WARN_ON for timer running on unexpected CPU Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 68 ++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 18b379cf061..3f0550d16f3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -353,18 +353,17 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status) static int check_interval = 5 * 60; /* 5 minutes */ static int next_interval; /* in jiffies */ -static void mcheck_timer(struct work_struct *work); -static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); +static void mcheck_timer(unsigned long); +static DEFINE_PER_CPU(struct timer_list, mce_timer); -static void mcheck_check_cpu(void *info) +static void mcheck_timer(unsigned long data) { + struct timer_list *t = &per_cpu(mce_timer, data); + + WARN_ON(smp_processor_id() != data); + if (mce_available(¤t_cpu_data)) do_machine_check(NULL, 0); -} - -static void mcheck_timer(struct work_struct *work) -{ - on_each_cpu(mcheck_check_cpu, NULL, 1); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -377,7 +376,8 @@ static void mcheck_timer(struct work_struct *work) (int)round_jiffies_relative(check_interval*HZ)); } - schedule_delayed_work(&mcheck_work, next_interval); + t->expires = jiffies + next_interval; + add_timer(t); } static void mce_do_trigger(struct work_struct *work) @@ -436,16 +436,11 @@ static struct notifier_block mce_idle_notifier = { static __init int periodic_mcheck_init(void) { - next_interval = check_interval * HZ; - if (next_interval) - schedule_delayed_work(&mcheck_work, - round_jiffies_relative(next_interval)); - idle_notifier_register(&mce_idle_notifier); - return 0; + idle_notifier_register(&mce_idle_notifier); + return 0; } __initcall(periodic_mcheck_init); - /* * Initialize Machine Checks for a CPU. */ @@ -515,6 +510,20 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) } } +static void mce_init_timer(void) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + + /* data race harmless because everyone sets to the same value */ + if (!next_interval) + next_interval = check_interval * HZ; + if (!next_interval) + return; + setup_timer(t, mcheck_timer, smp_processor_id()); + t->expires = round_jiffies_relative(jiffies + next_interval); + add_timer(t); +} + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off. @@ -529,6 +538,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) mce_init(NULL); mce_cpu_features(c); + mce_init_timer(); } /* @@ -758,17 +768,19 @@ static int mce_resume(struct sys_device *dev) return 0; } +static void mce_cpu_restart(void *data) +{ + del_timer_sync(&__get_cpu_var(mce_timer)); + if (mce_available(¤t_cpu_data)) + mce_init(NULL); + mce_init_timer(); +} + /* Reinit MCEs after user configuration changes */ static void mce_restart(void) { - if (next_interval) - cancel_delayed_work(&mcheck_work); - /* Timer race is harmless here */ - on_each_cpu(mce_init, NULL, 1); next_interval = check_interval * HZ; - if (next_interval) - schedule_delayed_work(&mcheck_work, - round_jiffies_relative(next_interval)); + on_each_cpu(mce_cpu_restart, NULL, 1); } static struct sysdev_class mce_sysclass = { @@ -899,6 +911,7 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; + struct timer_list *t = &per_cpu(mce_timer, cpu); switch (action) { case CPU_ONLINE: @@ -913,6 +926,15 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, threshold_cpu_callback(action, cpu); mce_remove_device(cpu); break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + del_timer_sync(t); + break; + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + t->expires = round_jiffies_relative(jiffies + next_interval); + add_timer_on(t, cpu); + break; } return NOTIFY_OK; } -- cgit v1.2.3 From 5b4408fdaa62474dd9485cddb9126370d90d4b82 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:30 +0100 Subject: x86, mce: don't set up mce sysdev devices with mce=off Impact: bug fix, in this case the resume handler shouldn't run which avoids incorrectly reenabling machine checks on resume When MCEs are completely disabled on the command line don't set up the sysdev devices for them either. Includes a comment fix from Thomas Gleixner. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 3f0550d16f3..4e2b1bc5131 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -151,6 +151,8 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) static int mce_available(struct cpuinfo_x86 *c) { + if (mce_dont_init) + return 0; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } @@ -532,8 +534,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { mce_cpu_quirks(c); - if (mce_dont_init || - !mce_available(c)) + if (!mce_available(c)) return; mce_init(NULL); @@ -710,8 +711,7 @@ static int __init mcheck_disable(char *str) return 1; } -/* mce=off disables machine check. Note you can re-enable it later - using sysfs. +/* mce=off disables machine check. mce=TOLERANCELEVEL (number, see above) mce=bootlog Log MCEs from before booting. Disabled by default on AMD. mce=nobootlog Don't log MCEs from before booting. */ -- cgit v1.2.3 From d6b75584a3eaab8cb2ab3e8cf90c5e57c1928a85 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:31 +0100 Subject: x86, mce: disable machine checks on offlined CPUs Impact: Lower priority bug fix Offlined CPUs could still get machine checks, but the machine check handler cannot handle them properly, leading to an unconditional crash. Disable machine checks on CPUs that are going down. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 4e2b1bc5131..1db94c0d5aa 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -906,6 +906,27 @@ static __cpuinit void mce_remove_device(unsigned int cpu) cpu_clear(cpu, mce_device_initialized); } +/* Make sure there are no machine checks on offlined CPUs. */ +static void __cpuexit mce_disable_cpu(void *h) +{ + int i; + + if (!mce_available(¤t_cpu_data)) + return; + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); +} + +static void __cpuexit mce_reenable_cpu(void *h) +{ + int i; + + if (!mce_available(¤t_cpu_data)) + return; + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); +} + /* Get notified when a cpu comes on/off. Be hotplug friendly. */ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -929,11 +950,13 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: del_timer_sync(t); + smp_call_function_single(cpu, mce_disable_cpu, NULL, 1); break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies_relative(jiffies + next_interval); add_timer_on(t, cpu); + smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1); break; } return NOTIFY_OK; -- cgit v1.2.3 From ef41df4344ff952c79746d44a6126bd2cf7ed2bc Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 12 Feb 2009 13:39:34 +0100 Subject: x86, mce: fix a race condition in mce_read() Impact: bugfix Considering the situation as follow: before: mcelog.next == 1, mcelog.entry[0].finished = 1 +-------------------------------------------------------------------------- R W1 W2 W3 read mcelog.next (1) mcelog.next++ (2) (working on entry 1, finished == 0) mcelog.next = 0 mcelog.next++ (1) (working on entry 0) mcelog.next++ (2) (working on entry 1) <----------------- race ----------------> (done on entry 1, finished = 1) (done on entry 1, finished = 1) To fix the race condition, a cmpxchg loop is added to mce_read() to ensure no new MCE record can be added between mcelog.next reading and mcelog.next = 0. Signed-off-by: Huang Ying Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 41 ++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 1db94c0d5aa..870d08deccf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -595,7 +595,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, { unsigned long *cpu_tsc; static DEFINE_MUTEX(mce_read_mutex); - unsigned next; + unsigned prev, next; char __user *buf = ubuf; int i, err; @@ -614,25 +614,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, } err = 0; - for (i = 0; i < next; i++) { - unsigned long start = jiffies; - - while (!mcelog.entry[i].finished) { - if (time_after_eq(jiffies, start + 2)) { - memset(mcelog.entry + i,0, sizeof(struct mce)); - goto timeout; + prev = 0; + do { + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + + while (!mcelog.entry[i].finished) { + if (time_after_eq(jiffies, start + 2)) { + memset(mcelog.entry + i, 0, + sizeof(struct mce)); + goto timeout; + } + cpu_relax(); } - cpu_relax(); + smp_rmb(); + err |= copy_to_user(buf, mcelog.entry + i, + sizeof(struct mce)); + buf += sizeof(struct mce); +timeout: + ; } - smp_rmb(); - err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); - buf += sizeof(struct mce); - timeout: - ; - } - memset(mcelog.entry, 0, next * sizeof(struct mce)); - mcelog.next = 0; + memset(mcelog.entry + prev, 0, + (next - prev) * sizeof(struct mce)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); synchronize_sched(); -- cgit v1.2.3 From 0d7482e3d76522157c9d741d79fce22c401fa0c5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 17 Feb 2009 23:07:13 +0100 Subject: x86, mce: implement dynamic machine check banks support Impact: cleanup; making code future proof; memory saving on small systems This patch replaces the hardcoded max number of machine check banks with dynamic allocation depending on what the CPU reports. The sysfs data structures and the banks array are dynamically allocated. There is still a hard bank limit (128) because the mcelog protocol uses banks >= 128 as pseudo banks to escape other events. But we expect that 128 banks is beyond any reasonable CPU for now. This supersedes an earlier patch by Venki, but it solves the problem more completely by making the limit fully dynamic (up to the 128 boundary). This saves some memory on machines with less than 6 banks because they won't need sysdevs for unused ones and also allows to use sysfs to control these banks on possible future CPUs with more than 6 banks. This is an updated patch addressing Venki's comments. I also added in another patch from Thomas which fixed the error allocation path (that patch was previously separated) Cc: Venki Pallipadi Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 147 ++++++++++++++++++++++++++++-------- 1 file changed, 115 insertions(+), 32 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 870d08deccf..2297730bb51 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include #include #include @@ -32,7 +34,12 @@ #include #define MISC_MCELOG_MINOR 227 -#define NR_SYSFS_BANKS 6 + +/* + * To support more than 128 would need to escape the predefined + * Linux defined extended banks first. + */ +#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) atomic_t mce_entry; @@ -47,7 +54,7 @@ static int mce_dont_init; */ static int tolerant = 1; static int banks; -static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; +static u64 *bank; static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; @@ -212,7 +219,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) barrier(); for (i = 0; i < banks; i++) { - if (i < NR_SYSFS_BANKS && !bank[i]) + if (!bank[i]) continue; m.misc = 0; @@ -446,37 +453,54 @@ __initcall(periodic_mcheck_init); /* * Initialize Machine Checks for a CPU. */ -static void mce_init(void *dummy) +static int mce_cap_init(void) { u64 cap; - int i; + unsigned b; rdmsrl(MSR_IA32_MCG_CAP, cap); - banks = cap & 0xff; - if (banks > MCE_EXTENDED_BANK) { - banks = MCE_EXTENDED_BANK; - printk(KERN_INFO "MCE: warning: using only %d banks\n", - MCE_EXTENDED_BANK); + b = cap & 0xff; + if (b > MAX_NR_BANKS) { + printk(KERN_WARNING + "MCE: Using only %u machine check banks out of %u\n", + MAX_NR_BANKS, b); + b = MAX_NR_BANKS; + } + + /* Don't support asymmetric configurations today */ + WARN_ON(banks != 0 && b != banks); + banks = b; + if (!bank) { + bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); + if (!bank) + return -ENOMEM; + memset(bank, 0xff, banks * sizeof(u64)); } + /* Use accurate RIP reporting if available. */ if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) rip_msr = MSR_IA32_MCG_EIP; + return 0; +} + +static void mce_init(void *dummy) +{ + u64 cap; + int i; + /* Log the machine checks left over from the previous reset. This also clears all registers */ do_machine_check(NULL, mce_bootlog ? -1 : -2); set_in_cr4(X86_CR4_MCE); + rdmsrl(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - if (i < NR_SYSFS_BANKS) - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); - else - wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); - + wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } } @@ -486,10 +510,10 @@ static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { - if(c->x86 == 15) + if (c->x86 == 15 && banks > 4) /* disable GART TBL walk error reporting, which trips off incorrectly with the IOMMU & 3ware & Cerberus. */ - clear_bit(10, &bank[4]); + clear_bit(10, (unsigned long *)&bank[4]); if(c->x86 <= 17 && mce_bootlog < 0) /* Lots of broken BIOS around that don't clear them by default and leave crap in there. Don't log. */ @@ -532,11 +556,15 @@ static void mce_init_timer(void) */ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { - mce_cpu_quirks(c); - if (!mce_available(c)) return; + if (mce_cap_init() < 0) { + mce_dont_init = 1; + return; + } + mce_cpu_quirks(c); + mce_init(NULL); mce_cpu_features(c); mce_init_timer(); @@ -819,16 +847,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit } \ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); -/* - * TBD should generate these dynamically based on number of available banks. - * Have only 6 contol banks in /sysfs until then. - */ -ACCESSOR(bank0ctl,bank[0],mce_restart()) -ACCESSOR(bank1ctl,bank[1],mce_restart()) -ACCESSOR(bank2ctl,bank[2],mce_restart()) -ACCESSOR(bank3ctl,bank[3],mce_restart()) -ACCESSOR(bank4ctl,bank[4],mce_restart()) -ACCESSOR(bank5ctl,bank[5],mce_restart()) +static struct sysdev_attribute *bank_attrs; + +static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, + char *buf) +{ + u64 b = bank[attr - bank_attrs]; + return sprintf(buf, "%Lx\n", b); +} + +static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, + const char *buf, size_t siz) +{ + char *end; + u64 new = simple_strtoull(buf, &end, 0); + if (end == buf) + return -EINVAL; + bank[attr - bank_attrs] = new; + mce_restart(); + return end-buf; +} static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) @@ -855,8 +893,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); ACCESSOR(check_interval,check_interval,mce_restart()) static struct sysdev_attribute *mce_attributes[] = { - &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, - &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, &attr_tolerant.attr, &attr_check_interval, &attr_trigger, NULL }; @@ -886,11 +922,22 @@ static __cpuinit int mce_create_device(unsigned int cpu) if (err) goto error; } + for (i = 0; i < banks; i++) { + err = sysdev_create_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); + if (err) + goto error2; + } cpu_set(cpu, mce_device_initialized); return 0; +error2: + while (--i >= 0) { + sysdev_remove_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); + } error: - while (i--) { + while (--i >= 0) { sysdev_remove_file(&per_cpu(device_mce,cpu), mce_attributes[i]); } @@ -909,6 +956,9 @@ static __cpuinit void mce_remove_device(unsigned int cpu) for (i = 0; mce_attributes[i]; i++) sysdev_remove_file(&per_cpu(device_mce,cpu), mce_attributes[i]); + for (i = 0; i < banks; i++) + sysdev_remove_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); sysdev_unregister(&per_cpu(device_mce,cpu)); cpu_clear(cpu, mce_device_initialized); } @@ -973,6 +1023,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; +static __init int mce_init_banks(void) +{ + int i; + + bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, + GFP_KERNEL); + if (!bank_attrs) + return -ENOMEM; + + for (i = 0; i < banks; i++) { + struct sysdev_attribute *a = &bank_attrs[i]; + a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); + if (!a->attr.name) + goto nomem; + a->attr.mode = 0644; + a->show = show_bank; + a->store = set_bank; + } + return 0; + +nomem: + while (--i >= 0) + kfree(bank_attrs[i].attr.name); + kfree(bank_attrs); + bank_attrs = NULL; + return -ENOMEM; +} + static __init int mce_init_device(void) { int err; @@ -980,6 +1058,11 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; + + err = mce_init_banks(); + if (err) + return err; + err = sysdev_class_register(&mce_sysclass); if (err) return err; -- cgit v1.2.3 From b5f2fa4ea00a179ac1c2ff342ceeee261dd75e53 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:43:22 +0100 Subject: x86, mce: factor out duplicated struct mce setup into one function Impact: cleanup This merely factors out duplicated code to set up the initial struct mce state into a single function. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 23 ++++++++++++++--------- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 4 +--- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +- 3 files changed, 16 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 2297730bb51..fed875742b1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -65,6 +65,14 @@ static char *trigger_argv[2] = { trigger, NULL }; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +/* Do initial initialization of a struct mce */ +void mce_setup(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + m->cpu = smp_processor_id(); + rdtscll(m->tsc); +} + /* * Lockless MCE logging infrastructure. * This avoids deadlocks on printk locks without having to break locks. Also @@ -208,8 +216,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) || !banks) goto out2; - memset(&m, 0, sizeof(struct mce)); - m.cpu = smp_processor_id(); + mce_setup(&m); + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); /* if the restart IP is not valid, we're done for */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) @@ -225,7 +233,6 @@ void do_machine_check(struct pt_regs * regs, long error_code) m.misc = 0; m.addr = 0; m.bank = i; - m.tsc = 0; rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); if ((m.status & MCI_STATUS_VAL) == 0) @@ -252,8 +259,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); mce_get_rip(&m, regs); - if (error_code >= 0) - rdtscll(m.tsc); + if (error_code < 0) + m.tsc = 0; if (error_code != -2) mce_log(&m); @@ -341,15 +348,13 @@ void do_machine_check(struct pt_regs * regs, long error_code) * and historically has been the register value of the * MSR_IA32_THERMAL_STATUS (Intel) msr. */ -void mce_log_therm_throt_event(unsigned int cpu, __u64 status) +void mce_log_therm_throt_event(__u64 status) { struct mce m; - memset(&m, 0, sizeof(m)); - m.cpu = cpu; + mce_setup(&m); m.bank = MCE_THERMAL_BANK; m.status = status; - rdtscll(m.tsc); mce_log(&m); } #endif /* CONFIG_X86_MCE_INTEL */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 8ae8c4ff094..75d9dd25e3d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -197,9 +197,7 @@ asmlinkage void mce_threshold_interrupt(void) exit_idle(); irq_enter(); - memset(&m, 0, sizeof(m)); - rdtscll(m.tsc); - m.cpu = smp_processor_id(); + mce_setup(&m); /* assume first bank caused it */ for (bank = 0; bank < NR_BANKS; ++bank) { diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 4b48f251fd3..7f7f1015ef1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -24,7 +24,7 @@ asmlinkage void smp_thermal_interrupt(void) rdmsrl(MSR_IA32_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & 1)) - mce_log_therm_throt_event(smp_processor_id(), msr_val); + mce_log_therm_throt_event(msr_val); inc_irq_stat(irq_thermal_count); irq_exit(); -- cgit v1.2.3 From b79109c3bbcf52cac5103979b283b9e5df4e796c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:43:23 +0100 Subject: x86, mce: separate correct machine check poller and fatal exception handler Impact: cleanup, performance enhancement The machine check poller is diverging more and more from the fatal exception handler. Instead of adding more special cases separate the code paths completely. The corrected poll path is actually quite simple, and this doesn't result in much code duplication. This makes both handlers much easier to read and results in cleaner code flow. The exception handler now only needs to care about uncorrected errors, which also simplifies the handling of multiple errors. The corrected poller also now always runs in standard interrupt context and does not need to do anything special to handle NMI context. Minor behaviour changes: - MCG status is now not cleared on polling. - Only the banks which had corrected errors get cleared on polling - The exception handler only clears banks with errors now v2: Forward port to new patch order. Add "uc" argument. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 129 ++++++++++++++++++++++++++------ arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 2 +- 2 files changed, 109 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index fed875742b1..268b05edade 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -3,6 +3,8 @@ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. * Rest from unknown author(s). * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen */ #include @@ -189,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } /* - * The actual machine check handler + * Poll for corrected events or events that happened before reset. + * Those are just logged through /dev/mcelog. + * + * This is executed in standard interrupt context. + */ +void machine_check_poll(enum mcp_flags flags) +{ + struct mce m; + int i; + + mce_setup(&m); + + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + for (i = 0; i < banks; i++) { + if (!bank[i]) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + m.tsc = 0; + + barrier(); + rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + if (!(m.status & MCI_STATUS_VAL)) + continue; + + /* + * Uncorrected events are handled by the exception handler + * when it is enabled. But when the exception is disabled log + * everything. + * + * TBD do the same check for MCI_STATUS_EN here? + */ + if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) + continue; + + if (m.status & MCI_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + + if (!(flags & MCP_TIMESTAMP)) + m.tsc = 0; + /* + * Don't get the IP here because it's unlikely to + * have anything to do with the actual error location. + */ + + mce_log(&m); + add_taint(TAINT_MACHINE_CHECK); + + /* + * Clear state for this bank. + */ + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } + + /* + * Don't clear MCG_STATUS here because it's only defined for + * exceptions. + */ +} + +/* + * The actual machine check handler. This only handles real + * exceptions when something got corrupted coming in through int 18. + * + * This is executed in NMI context not subject to normal locking rules. This + * implies that most kernel services cannot be safely used. Don't even + * think about putting a printk in there! */ void do_machine_check(struct pt_regs * regs, long error_code) { @@ -207,13 +279,14 @@ void do_machine_check(struct pt_regs * regs, long error_code) * error. */ int kill_it = 0; + DECLARE_BITMAP(toclear, MAX_NR_BANKS); atomic_inc(&mce_entry); - if ((regs - && notify_die(DIE_NMI, "machine check", regs, error_code, + if (notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL) == NOTIFY_STOP) - || !banks) + goto out2; + if (!banks) goto out2; mce_setup(&m); @@ -227,6 +300,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) barrier(); for (i = 0; i < banks; i++) { + __clear_bit(i, toclear); if (!bank[i]) continue; @@ -238,6 +312,20 @@ void do_machine_check(struct pt_regs * regs, long error_code) if ((m.status & MCI_STATUS_VAL) == 0) continue; + /* + * Non uncorrected errors are handled by machine_check_poll + * Leave them alone. + */ + if ((m.status & MCI_STATUS_UC) == 0) + continue; + + /* + * Set taint even when machine check was not enabled. + */ + add_taint(TAINT_MACHINE_CHECK); + + __set_bit(i, toclear); + if (m.status & MCI_STATUS_EN) { /* if PCC was set, there's no way out */ no_way_out |= !!(m.status & MCI_STATUS_PCC); @@ -251,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code) no_way_out = 1; kill_it = 1; } + } else { + /* + * Machine check event was not enabled. Clear, but + * ignore. + */ + continue; } if (m.status & MCI_STATUS_MISCV) @@ -259,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); mce_get_rip(&m, regs); - if (error_code < 0) - m.tsc = 0; - if (error_code != -2) - mce_log(&m); + mce_log(&m); /* Did this bank cause the exception? */ /* Assume that the bank with uncorrectable errors did it, @@ -271,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) panicm = m; panicm_found = 1; } - - add_taint(TAINT_MACHINE_CHECK); } - /* Never do anything final in the polling timer */ - if (!regs) - goto out; - /* If we didn't find an uncorrectable error, pick the last one (shouldn't happen, just being safe). */ if (!panicm_found) @@ -325,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code) /* notify userspace ASAP */ set_thread_flag(TIF_MCE_NOTIFY); - out: /* the last thing we do is clear state */ - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + for (i = 0; i < banks; i++) { + if (test_bit(i, toclear)) + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } wrmsrl(MSR_IA32_MCG_STATUS, 0); out2: atomic_dec(&mce_entry); @@ -377,7 +463,7 @@ static void mcheck_timer(unsigned long data) WARN_ON(smp_processor_id() != data); if (mce_available(¤t_cpu_data)) - do_machine_check(NULL, 0); + machine_check_poll(MCP_TIMESTAMP); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -494,9 +580,10 @@ static void mce_init(void *dummy) u64 cap; int i; - /* Log the machine checks left over from the previous reset. - This also clears all registers */ - do_machine_check(NULL, mce_bootlog ? -1 : -2); + /* + * Log the machine checks left over from the previous reset. + */ + machine_check_poll(MCP_UC); set_in_cr4(X86_CR4_MCE); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 75d9dd25e3d..0069c653f4e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -231,7 +231,7 @@ asmlinkage void mce_threshold_interrupt(void) /* Log the machine check that caused the threshold event. */ - do_machine_check(NULL, 0); + machine_check_poll(MCP_TIMESTAMP); if (high & MASK_OVERFLOW_HI) { rdmsrl(address, m.misc); -- cgit v1.2.3 From f6d1826dfad0d15fd14a455facc80b91f2ee642f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 19 Feb 2009 15:44:58 -0800 Subject: x86, mce: use %ll instead of %L for 64-bit numbers Impact: Cleanup The standard spelling of a printf pattern for long long is "ll", not "L", which is for long double. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 268b05edade..60a114ca14f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -136,11 +136,11 @@ static void print_mce(struct mce *m) print_symbol("{%s}", m->ip); printk("\n"); } - printk(KERN_EMERG "TSC %Lx ", m->tsc); + printk(KERN_EMERG "TSC %llx ", m->tsc); if (m->addr) - printk("ADDR %Lx ", m->addr); + printk("ADDR %llx ", m->addr); if (m->misc) - printk("MISC %Lx ", m->misc); + printk("MISC %llx ", m->misc); printk("\n"); printk(KERN_EMERG "This is not a software problem!\n"); printk(KERN_EMERG "Run through mcelog --ascii to decode " @@ -945,7 +945,7 @@ static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { u64 b = bank[attr - bank_attrs]; - return sprintf(buf, "%Lx\n", b); + return sprintf(buf, "%llx\n", b); } static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, -- cgit v1.2.3 From ec5b3d32437571b8a742069a4cfd04edb6b6eda5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 23 Feb 2009 14:01:04 -0800 Subject: x86, mce: remove invalid __cpuinit/__cpuexit annotations Impact: Bug fix when CPU hotplug is disabled Correct the following broken __cpuinit/__cpuexit annotations: - mce_cpu_features() is called from mce_resume(), and so cannot be __cpuinit. - mce_disable_cpu() and mce_reenable_cpu() are called from mce_cpu_callback(), and so cannot be __cpuexit(). Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 60a114ca14f..0625993bf95 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -598,7 +598,7 @@ static void mce_init(void *dummy) } /* Add per CPU specific workarounds here */ -static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) +static void mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { @@ -1056,7 +1056,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu) } /* Make sure there are no machine checks on offlined CPUs. */ -static void __cpuexit mce_disable_cpu(void *h) +static void mce_disable_cpu(void *h) { int i; @@ -1066,7 +1066,7 @@ static void __cpuexit mce_disable_cpu(void *h) wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); } -static void __cpuexit mce_reenable_cpu(void *h) +static void mce_reenable_cpu(void *h) { int i; -- cgit v1.2.3 From 41fdff322e26c4a86fe65cf577f2556a650cb7bc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:30 +0100 Subject: x86, mce, cmci: export MAX_NR_BANKS Impact: Cleanup (code movement) Move MAX_NR_BANKS into mce.h because it's needed there for followup patches. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index a4a7c686ce9..39f8bb525a7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -37,12 +37,6 @@ #define MISC_MCELOG_MINOR 227 -/* - * To support more than 128 would need to escape the predefined - * Linux defined extended banks first. - */ -#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) - atomic_t mce_entry; static int mce_dont_init; -- cgit v1.2.3 From b276268631af3a1b0df871e10d19d492f0513d4b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:31 +0100 Subject: x86, mce, cmci: factor out threshold interrupt handler Impact: cleanup; preparation for feature The mce_amd_64 code has an own private MC threshold vector with an own interrupt handler. Since Intel needs a similar handler it makes sense to share the vector because both can not be active at the same time. I factored the common APIC handler code into a separate file which can be used by both the Intel or AMD MC code. This is needed for the next patch which adds an Intel specific CMCI handler. This patch should be a nop for AMD, it just moves some code around. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/Makefile | 1 + arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 15 ++++++--------- arch/x86/kernel/cpu/mcheck/threshold.c | 24 ++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 9 deletions(-) create mode 100644 arch/x86/kernel/cpu/mcheck/threshold.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index d7d2323bbb6..b2f89829bbe 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o +obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index e82c8208b81..49705be9820 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = { static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ +static void amd_threshold_interrupt(void); + /* * CPU Initialization */ @@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) tr.reset = 0; tr.old_limit = 0; threshold_restart_bank(&tr); + + mce_threshold_vector = amd_threshold_interrupt; } } } @@ -187,16 +191,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) * the interrupt goes off when error_count reaches threshold_limit. * the handler will simply log mcelog w/ software defined bank number. */ -asmlinkage void mce_threshold_interrupt(void) +static void amd_threshold_interrupt(void) { unsigned int bank, block; struct mce m; u32 low = 0, high = 0, address = 0; - ack_APIC_irq(); - exit_idle(); - irq_enter(); - mce_setup(&m); /* assume first bank caused it */ @@ -241,13 +241,10 @@ asmlinkage void mce_threshold_interrupt(void) + bank * NR_BLOCKS + block; mce_log(&m); - goto out; + return; } } } -out: - inc_irq_stat(irq_threshold_count); - irq_exit(); } /* diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c new file mode 100644 index 00000000000..4319142413d --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -0,0 +1,24 @@ +/* Common corrected MCE threshold handler code */ +#include +#include +#include +#include +#include + +static void default_threshold_interrupt(void) +{ + printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n", + THRESHOLD_APIC_VECTOR); +} + +void (*mce_threshold_vector)(void) = default_threshold_interrupt; + +asmlinkage void mce_threshold_interrupt(void) +{ + ack_APIC_irq(); + exit_idle(); + irq_enter(); + inc_irq_stat(irq_threshold_count); + mce_threshold_vector(); + irq_exit(); +} -- cgit v1.2.3 From f9695df42cdbca78530b4458c38ecfdd0bb90079 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:32 +0100 Subject: x86, mce, cmci: avoid potential reentry of threshold interrupt Impact: minor bugfix The threshold handler on AMD (and soon on Intel) could be theoretically reentered by the hardware. This could lead to corrupted events because the machine check poll code assumes it is not reentered. Move the APIC ACK to the end of the interrupt handler to let the hardware avoid that. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/threshold.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 4319142413d..e4b8a3833fc 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -15,10 +15,11 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt; asmlinkage void mce_threshold_interrupt(void) { - ack_APIC_irq(); exit_idle(); irq_enter(); inc_irq_stat(irq_threshold_count); mce_threshold_vector(); irq_exit(); + /* Ack only at the end to avoid potential reentry */ + ack_APIC_irq(); } -- cgit v1.2.3 From 8457c84d68678cbfd4167a9073b89da58e48c037 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:33 +0100 Subject: x86, mce: replace machine check events logged interval with ratelimit Impact: behavior change, use common code Use a standard leaky bucket ratelimit for the machine check warning print interval instead of waiting every check_interval. Also decrease the limit to twice per minute. This interacts better with threshold interrupts because they can happen more often than check_interval. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 39f8bb525a7..9017609cadd 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -488,11 +489,11 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger); */ int mce_notify_user(void) { + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + clear_thread_flag(TIF_MCE_NOTIFY); if (test_and_clear_bit(0, ¬ify_user)) { - static unsigned long last_print; - unsigned long now = jiffies; - wake_up_interruptible(&mce_wait); /* @@ -503,10 +504,8 @@ int mce_notify_user(void) if (trigger[0] && !work_pending(&mce_trigger_work)) schedule_work(&mce_trigger_work); - if (time_after_eq(now, last_print + (check_interval*HZ))) { - last_print = now; + if (__ratelimit(&ratelimit)) printk(KERN_INFO "Machine check events logged\n"); - } return 1; } -- cgit v1.2.3 From ee031c31d6381d004bfd386c2e45821211507499 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:34 +0100 Subject: x86, mce, cmci: use polled banks bitmap in machine check poller Define a per cpu bitmap that contains the banks polled by the machine check poller. This is needed for the CMCI code in the next patches to be able to disable polling on specific banks. The bank by default contains all banks, so there is no behaviour change. Only future code will remove some banks from the polling set. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 16 ++++++++++++---- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 3 ++- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 9017609cadd..a8ff38bfa6e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -62,6 +62,11 @@ static char *trigger_argv[2] = { trigger, NULL }; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +/* MCA banks polled by the period polling timer for corrected events */ +DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { + [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL +}; + /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { @@ -191,7 +196,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) * * This is executed in standard interrupt context. */ -void machine_check_poll(enum mcp_flags flags) +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce m; int i; @@ -200,7 +205,7 @@ void machine_check_poll(enum mcp_flags flags) rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); for (i = 0; i < banks; i++) { - if (!bank[i]) + if (!bank[i] || !test_bit(i, *b)) continue; m.misc = 0; @@ -458,7 +463,8 @@ static void mcheck_timer(unsigned long data) WARN_ON(smp_processor_id() != data); if (mce_available(¤t_cpu_data)) - machine_check_poll(MCP_TIMESTAMP); + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -572,11 +578,13 @@ static void mce_init(void *dummy) { u64 cap; int i; + mce_banks_t all_banks; /* * Log the machine checks left over from the previous reset. */ - machine_check_poll(MCP_UC); + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC, &all_banks); set_in_cr4(X86_CR4_MCE); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 49705be9820..ee8bfcd3aa3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -231,7 +231,8 @@ static void amd_threshold_interrupt(void) /* Log the machine check that caused the threshold event. */ - machine_check_poll(MCP_TIMESTAMP); + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); if (high & MASK_OVERFLOW_HI) { rdmsrl(address, m.misc); -- cgit v1.2.3 From 88ccbedd9ca85d1aca6a6f99df48dce87b7c02d4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:36 +0100 Subject: x86, mce, cmci: add CMCI support Impact: Major new feature Intel CMCI (Corrected Machine Check Interrupt) is a new feature on Nehalem CPUs. It allows the CPU to trigger interrupts on corrected events, which allows faster reaction to them instead of with the traditional polling timer. Also use CMCI to discover shared banks. Machine check banks can be shared by CPU threads or even cores. Using the CMCI enable bit it is possible to detect the fact that another CPU already saw a specific bank. Use this to assign shared banks only to one CPU to avoid reporting duplicated events. On CPU hot unplug bank sharing is re discovered. This is done using a thread that cycles through all the CPUs. To avoid races between the poller and CMCI we only poll for banks that are not CMCI capable and only check CMCI owned banks on a interrupt. The shared banks ownership information is currently only used for CMCI interrupts, not polled banks. The sharing discovery code follows the algorithm recommended in the IA32 SDM Vol3a 14.5.2.1 The CMCI interrupt handler just calls the machine check poller to pick up the machine check event that caused the interrupt. I decided not to implement a separate threshold event like the AMD version has, because the threshold is always one currently and adding another event didn't seem to add any value. Some code inspired by Yunhong Jiang's Xen implementation, which was in term inspired by a earlier CMCI implementation by me. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 16 ++- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 205 ++++++++++++++++++++++++++++++ 2 files changed, 218 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index a8ff38bfa6e..bfbd5323a63 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -166,7 +166,7 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) panic(msg); } -static int mce_available(struct cpuinfo_x86 *c) +int mce_available(struct cpuinfo_x86 *c) { if (mce_dont_init) return 0; @@ -1060,9 +1060,12 @@ static __cpuinit void mce_remove_device(unsigned int cpu) static void mce_disable_cpu(void *h) { int i; + unsigned long action = *(unsigned long *)h; if (!mce_available(¤t_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) + cmci_clear(); for (i = 0; i < banks; i++) wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); } @@ -1070,9 +1073,12 @@ static void mce_disable_cpu(void *h) static void mce_reenable_cpu(void *h) { int i; + unsigned long action = *(unsigned long *)h; if (!mce_available(¤t_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) + cmci_reenable(); for (i = 0; i < banks; i++) wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); } @@ -1100,13 +1106,17 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: del_timer_sync(t); - smp_call_function_single(cpu, mce_disable_cpu, NULL, 1); + smp_call_function_single(cpu, mce_disable_cpu, &action, 1); break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies_relative(jiffies + next_interval); add_timer_on(t, cpu); - smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1); + smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); + break; + case CPU_POST_DEAD: + /* intentionally ignoring frozen here */ + cmci_rediscover(cpu); break; } return NOTIFY_OK; diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 1b1491a76b5..a518ec8c6f8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -1,6 +1,8 @@ /* * Intel specific MCE features. * Copyright 2004 Zwane Mwaikambo + * Copyright (C) 2008, 2009 Intel Corporation + * Author: Andi Kleen */ #include @@ -12,6 +14,7 @@ #include #include #include +#include asmlinkage void smp_thermal_interrupt(void) { @@ -84,7 +87,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) return; } +/* + * Support for Intel Correct Machine Check Interrupts. This allows + * the CPU to raise an interrupt when a corrected machine check happened. + * Normally we pick those up using a regular polling timer. + * Also supports reliable discovery of shared banks. + */ + +static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); + +/* + * cmci_discover_lock protects against parallel discovery attempts + * which could race against each other. + */ +static DEFINE_SPINLOCK(cmci_discover_lock); + +#define CMCI_THRESHOLD 1 + +static __cpuinit int cmci_supported(int *banks) +{ + u64 cap; + + /* + * Vendor check is not strictly needed, but the initial + * initialization is vendor keyed and this + * makes sure none of the backdoors are entered otherwise. + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + if (!cpu_has_apic || lapic_get_maxlvt() < 6) + return 0; + rdmsrl(MSR_IA32_MCG_CAP, cap); + *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); + return !!(cap & MCG_CMCI_P); +} + +/* + * The interrupt handler. This is called on every event. + * Just call the poller directly to log any events. + * This could in theory increase the threshold under high load, + * but doesn't for now. + */ +static void intel_threshold_interrupt(void) +{ + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + mce_notify_user(); +} + +static void print_update(char *type, int *hdr, int num) +{ + if (*hdr == 0) + printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); + *hdr = 1; + printk(KERN_CONT " %s:%d", type, num); +} + +/* + * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks + * on this CPU. Use the algorithm recommended in the SDM to discover shared + * banks. + */ +static __cpuinit void cmci_discover(int banks, int boot) +{ + unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); + int hdr = 0; + int i; + + spin_lock(&cmci_discover_lock); + for (i = 0; i < banks; i++) { + u64 val; + + if (test_bit(i, owned)) + continue; + + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + + /* Already owned by someone else? */ + if (val & CMCI_EN) { + if (test_and_clear_bit(i, owned) || boot) + print_update("SHD", &hdr, i); + __clear_bit(i, __get_cpu_var(mce_poll_banks)); + continue; + } + + val |= CMCI_EN | CMCI_THRESHOLD; + wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + + /* Did the enable bit stick? -- the bank supports CMCI */ + if (val & CMCI_EN) { + if (!test_and_set_bit(i, owned) || boot) + print_update("CMCI", &hdr, i); + __clear_bit(i, __get_cpu_var(mce_poll_banks)); + } else { + WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); + } + } + spin_unlock(&cmci_discover_lock); + if (hdr) + printk(KERN_CONT "\n"); +} + +/* + * Just in case we missed an event during initialization check + * all the CMCI owned banks. + */ +__cpuinit void cmci_recheck(void) +{ + unsigned long flags; + int banks; + + if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) + return; + local_irq_save(flags); + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + local_irq_restore(flags); +} + +/* + * Disable CMCI on this CPU for all banks it owns when it goes down. + * This allows other CPUs to claim the banks on rediscovery. + */ +void __cpuexit cmci_clear(void) +{ + int i; + int banks; + u64 val; + + if (!cmci_supported(&banks)) + return; + spin_lock(&cmci_discover_lock); + for (i = 0; i < banks; i++) { + if (!test_bit(i, __get_cpu_var(mce_banks_owned))) + continue; + /* Disable CMCI */ + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); + wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + __clear_bit(i, __get_cpu_var(mce_banks_owned)); + } + spin_unlock(&cmci_discover_lock); +} + +/* + * After a CPU went down cycle through all the others and rediscover + * Must run in process context. + */ +void __cpuexit cmci_rediscover(int dying) +{ + int banks; + int cpu; + cpumask_var_t old; + + if (!cmci_supported(&banks)) + return; + if (!alloc_cpumask_var(&old, GFP_KERNEL)) + return; + cpumask_copy(old, ¤t->cpus_allowed); + + for_each_online_cpu (cpu) { + if (cpu == dying) + continue; + if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu))) + continue; + /* Recheck banks in case CPUs don't all have the same */ + if (cmci_supported(&banks)) + cmci_discover(banks, 0); + } + + set_cpus_allowed_ptr(current, old); + free_cpumask_var(old); +} + +/* + * Reenable CMCI on this CPU in case a CPU down failed. + */ +void cmci_reenable(void) +{ + int banks; + if (cmci_supported(&banks)) + cmci_discover(banks, 0); +} + +static __cpuinit void intel_init_cmci(void) +{ + int banks; + + if (!cmci_supported(&banks)) + return; + + mce_threshold_vector = intel_threshold_interrupt; + cmci_discover(banks, 1); + /* + * For CPU #0 this runs with still disabled APIC, but that's + * ok because only the vector is set up. We still do another + * check for the banks later for CPU #0 just to make sure + * to not miss any events. + */ + apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); + cmci_recheck(); +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); + intel_init_cmci(); } -- cgit v1.2.3 From df20e2eb3e59b8625021a1bc8b1b53a4edc6008b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 24 Feb 2009 13:19:02 -0800 Subject: x86, mce, cmci: remove incorrect __cpuinit/__cpuexit annotations Impact: Bug fix on UP The MCE code is reinitialized from resume, so we can't use __cpuinit/__cpuexit for most of the code. Remove those annotations for anything downstream of mce_init(). Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index a518ec8c6f8..7a2e10fcfa3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -104,7 +104,7 @@ static DEFINE_SPINLOCK(cmci_discover_lock); #define CMCI_THRESHOLD 1 -static __cpuinit int cmci_supported(int *banks) +static int cmci_supported(int *banks) { u64 cap; @@ -147,7 +147,7 @@ static void print_update(char *type, int *hdr, int num) * on this CPU. Use the algorithm recommended in the SDM to discover shared * banks. */ -static __cpuinit void cmci_discover(int banks, int boot) +static void cmci_discover(int banks, int boot) { unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); int hdr = 0; @@ -192,7 +192,7 @@ static __cpuinit void cmci_discover(int banks, int boot) * Just in case we missed an event during initialization check * all the CMCI owned banks. */ -__cpuinit void cmci_recheck(void) +void cmci_recheck(void) { unsigned long flags; int banks; @@ -208,7 +208,7 @@ __cpuinit void cmci_recheck(void) * Disable CMCI on this CPU for all banks it owns when it goes down. * This allows other CPUs to claim the banks on rediscovery. */ -void __cpuexit cmci_clear(void) +void cmci_clear(void) { int i; int banks; @@ -233,7 +233,7 @@ void __cpuexit cmci_clear(void) * After a CPU went down cycle through all the others and rediscover * Must run in process context. */ -void __cpuexit cmci_rediscover(int dying) +void cmci_rediscover(int dying) { int banks; int cpu; -- cgit v1.2.3 From 5ca8681ca10f671427710f4954644359856581a3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:37 +0100 Subject: x86, mce, cmci: disable CMCI on rebooting Impact: Avoids confusing other OSes. Disable the CMCI vector on reboot to avoid confusing other OS. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 570f36e44e5..648676f0b50 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -868,6 +868,14 @@ void clear_local_APIC(void) apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); } #endif +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 6) { + v = apic_read(APIC_LVTCMCI); + if (!(v & APIC_LVT_MASKED)) + apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED); + } +#endif + /* * Clean APIC state for other OSs: */ -- cgit v1.2.3 From be71b8553d0522aba535a815baaebb1f0bb9a9ec Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:38 +0100 Subject: x86, mce, cmci: recheck CMCI banks after APIC has been enabled on CPU #0 Impact: Fix marginal race condition One the first CPU the machine checks are enabled early before the local APIC is enabled. This could in theory lead to some lost CMCI events very early during boot because CMCIs cannot be delivered with disabled LAPIC. The poller also doesn't recover from this because it doesn't check CMCI banks. Add an explicit CMCI banks check after the LAPIC is enabled. This is only done for CPU #0, the other CPUs only initialize machine checks after the LAPIC is on. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 648676f0b50..57b53774d98 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -1270,6 +1271,12 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_LVT1, value); preempt_enable(); + +#ifdef CONFIG_X86_MCE_INTEL + /* Recheck CMCI information after local APIC is up on CPU #0 */ + if (smp_processor_id() == 0) + cmci_recheck(); +#endif } void __cpuinit end_local_APIC_setup(void) -- cgit v1.2.3 From 73af76dfd1f998dba71d8e8e785cbe77a990bf17 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 4 Mar 2009 11:47:17 +0100 Subject: x86, mce: fix build failure in arch/x86/kernel/cpu/mcheck/threshold.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: build fix The APIC code rewrite in the x86 tree broke the x86/mce branch: arch/x86/kernel/cpu/mcheck/threshold.c: In function ‘mce_threshold_interrupt’: arch/x86/kernel/cpu/mcheck/threshold.c:24: error: implicit declaration of function ‘ack_APIC_irq’ Also tidy up the file a bit while at it. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/threshold.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index e4b8a3833fc..23ee9e730f7 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -1,9 +1,13 @@ -/* Common corrected MCE threshold handler code */ -#include +/* + * Common corrected MCE threshold handler code: + */ #include -#include +#include + #include +#include #include +#include static void default_threshold_interrupt(void) { -- cgit v1.2.3 From acaabe795a62bba089c185917af86b44654313dc Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 4 Mar 2009 12:56:05 -0600 Subject: x86: UV, SGI RTC: add generic system vector This patch allocates a system interrupt vector for various platform specific uses. Signed-off-by: Dimitri Sivanich Cc: Andrew Morton Cc: john stultz LKML-Reference: <20090304185605.GA24419@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 ++ arch/x86/kernel/irq.c | 34 ++++++++++++++++++++++++++++++++++ arch/x86/kernel/irqinit_32.c | 3 +++ arch/x86/kernel/irqinit_64.c | 3 +++ 4 files changed, 42 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 83d1836b946..7ba4621c0df 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -984,6 +984,8 @@ apicinterrupt UV_BAU_MESSAGE \ #endif apicinterrupt LOCAL_TIMER_VECTOR \ apic_timer_interrupt smp_apic_timer_interrupt +apicinterrupt GENERIC_INTERRUPT_VECTOR \ + generic_interrupt smp_generic_interrupt #ifdef CONFIG_SMP apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index f13ca1650aa..b864341dcc4 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -15,6 +15,9 @@ atomic_t irq_err_count; +/* Function pointer for generic interrupt vector handling */ +void (*generic_interrupt_extension)(void) = NULL; + /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. @@ -56,6 +59,12 @@ static int show_other_interrupts(struct seq_file *p) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); seq_printf(p, " Local timer interrupts\n"); #endif + if (generic_interrupt_extension) { + seq_printf(p, "PLT: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); + seq_printf(p, " Platform interrupts\n"); + } #ifdef CONFIG_SMP seq_printf(p, "RES: "); for_each_online_cpu(j) @@ -163,6 +172,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; #endif + if (generic_interrupt_extension) + sum += irq_stats(cpu)->generic_irqs; #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; sum += irq_stats(cpu)->irq_call_count; @@ -226,4 +237,27 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) return 1; } +/* + * Handler for GENERIC_INTERRUPT_VECTOR. + */ +void smp_generic_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + ack_APIC_irq(); + + exit_idle(); + + irq_enter(); + + inc_irq_stat(generic_irqs); + + if (generic_interrupt_extension) + generic_interrupt_extension(); + + irq_exit(); + + set_irq_regs(old_regs); +} + EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 50b8c3a3006..bc132610544 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -175,6 +175,9 @@ void __init native_init_IRQ(void) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + /* generic IPI for platform specific use */ + alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); + /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index da481a1e3f3..c7a49e0ffbf 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -147,6 +147,9 @@ static void __init apic_intr_init(void) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + /* generic IPI for platform specific use */ + alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); + /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); -- cgit v1.2.3 From 5ab5ab34498f94d60884c4ccea890601e429042e Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 4 Mar 2009 12:59:18 -0600 Subject: x86: UV, SGI RTC: add UV RTC clocksource/clockevents This patch provides a high resolution clock/timer source using the SGI UV system-wide synchronized RTC clock/timer hardware. Signed-off-by: Dimitri Sivanich Cc: Andrew Morton Cc: john stultz LKML-Reference: <20090304185918.GC24419@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/uv_time.c | 391 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 392 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/uv_time.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 95f216bbfaf..339ce35648e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -111,7 +111,7 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64 ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) - obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o + obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c new file mode 100644 index 00000000000..6f8e3256ab2 --- /dev/null +++ b/arch/x86/kernel/uv_time.c @@ -0,0 +1,391 @@ +/* + * SGI RTC clock/timer routines. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Dimitri Sivanich + */ +#include + +#include +#include +#include +#include + +#define RTC_NAME "sgi_rtc" + +static cycle_t uv_read_rtc(void); +static int uv_rtc_next_event(unsigned long, struct clock_event_device *); +static void uv_rtc_timer_setup(enum clock_event_mode, + struct clock_event_device *); + +static struct clocksource clocksource_uv = { + .name = RTC_NAME, + .rating = 400, + .read = uv_read_rtc, + .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, + .shift = 10, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static struct clock_event_device clock_event_device_uv = { + .name = RTC_NAME, + .features = CLOCK_EVT_FEAT_ONESHOT, + .shift = 20, + .rating = 400, + .irq = -1, + .set_next_event = uv_rtc_next_event, + .set_mode = uv_rtc_timer_setup, + .event_handler = NULL, +}; + +static DEFINE_PER_CPU(struct clock_event_device, cpu_ced); + +/* There is one of these allocated per node */ +struct uv_rtc_timer_head { + spinlock_t lock; + /* next cpu waiting for timer, local node relative: */ + int next_cpu; + /* number of cpus on this node: */ + int ncpus; + struct { + int lcpu; /* systemwide logical cpu number */ + u64 expires; /* next timer expiration for this cpu */ + } cpu[1]; +}; + +/* + * Access to uv_rtc_timer_head via blade id. + */ +static struct uv_rtc_timer_head **blade_info __read_mostly; + +static int uv_rtc_enable; + +/* + * Hardware interface routines + */ + +/* Send IPIs to another node */ +static void uv_rtc_send_IPI(int cpu) +{ + unsigned long apicid, val; + int pnode; + + apicid = per_cpu(x86_cpu_to_apicid, cpu); + pnode = uv_apicid_to_pnode(apicid); + val = (1UL << UVH_IPI_INT_SEND_SHFT) | + (apicid << UVH_IPI_INT_APIC_ID_SHFT) | + (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); + + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); +} + +/* Check for an RTC interrupt pending */ +static int uv_intr_pending(int pnode) +{ + return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & + UVH_EVENT_OCCURRED0_RTC1_MASK; +} + +/* Setup interrupt and return non-zero if early expiration occurred. */ +static int uv_setup_intr(int cpu, u64 expires) +{ + u64 val; + int pnode = uv_cpu_to_pnode(cpu); + + uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, + UVH_RTC1_INT_CONFIG_M_MASK); + uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L); + + uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, + UVH_EVENT_OCCURRED0_RTC1_MASK); + + val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | + ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); + + /* Set configuration */ + uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val); + /* Initialize comparator value */ + uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); + + return (expires < uv_read_rtc() && !uv_intr_pending(pnode)); +} + +/* + * Per-cpu timer tracking routines + */ + +static __init void uv_rtc_deallocate_timers(void) +{ + int bid; + + for_each_possible_blade(bid) { + kfree(blade_info[bid]); + } + kfree(blade_info); +} + +/* Allocate per-node list of cpu timer expiration times. */ +static __init int uv_rtc_allocate_timers(void) +{ + int cpu; + + blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL); + if (!blade_info) + return -ENOMEM; + memset(blade_info, 0, uv_possible_blades * sizeof(void *)); + + for_each_present_cpu(cpu) { + int nid = cpu_to_node(cpu); + int bid = uv_cpu_to_blade_id(cpu); + int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + struct uv_rtc_timer_head *head = blade_info[bid]; + + if (!head) { + head = kmalloc_node(sizeof(struct uv_rtc_timer_head) + + (uv_blade_nr_possible_cpus(bid) * + 2 * sizeof(u64)), + GFP_KERNEL, nid); + if (!head) { + uv_rtc_deallocate_timers(); + return -ENOMEM; + } + spin_lock_init(&head->lock); + head->ncpus = uv_blade_nr_possible_cpus(bid); + head->next_cpu = -1; + blade_info[bid] = head; + } + + head->cpu[bcpu].lcpu = cpu; + head->cpu[bcpu].expires = ULLONG_MAX; + } + + return 0; +} + +/* Find and set the next expiring timer. */ +static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode) +{ + u64 lowest = ULLONG_MAX; + int c, bcpu = -1; + + head->next_cpu = -1; + for (c = 0; c < head->ncpus; c++) { + u64 exp = head->cpu[c].expires; + if (exp < lowest) { + bcpu = c; + lowest = exp; + } + } + if (bcpu >= 0) { + head->next_cpu = bcpu; + c = head->cpu[bcpu].lcpu; + if (uv_setup_intr(c, lowest)) + /* If we didn't set it up in time, trigger */ + uv_rtc_send_IPI(c); + } else { + uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, + UVH_RTC1_INT_CONFIG_M_MASK); + } +} + +/* + * Set expiration time for current cpu. + * + * Returns 1 if we missed the expiration time. + */ +static int uv_rtc_set_timer(int cpu, u64 expires) +{ + int pnode = uv_cpu_to_pnode(cpu); + int bid = uv_cpu_to_blade_id(cpu); + struct uv_rtc_timer_head *head = blade_info[bid]; + int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + u64 *t = &head->cpu[bcpu].expires; + unsigned long flags; + int next_cpu; + + spin_lock_irqsave(&head->lock, flags); + + next_cpu = head->next_cpu; + *t = expires; + /* Will this one be next to go off? */ + if (next_cpu < 0 || bcpu == next_cpu || + expires < head->cpu[next_cpu].expires) { + head->next_cpu = bcpu; + if (uv_setup_intr(cpu, expires)) { + *t = ULLONG_MAX; + uv_rtc_find_next_timer(head, pnode); + spin_unlock_irqrestore(&head->lock, flags); + return 1; + } + } + + spin_unlock_irqrestore(&head->lock, flags); + return 0; +} + +/* + * Unset expiration time for current cpu. + * + * Returns 1 if this timer was pending. + */ +static int uv_rtc_unset_timer(int cpu) +{ + int pnode = uv_cpu_to_pnode(cpu); + int bid = uv_cpu_to_blade_id(cpu); + struct uv_rtc_timer_head *head = blade_info[bid]; + int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + u64 *t = &head->cpu[bcpu].expires; + unsigned long flags; + int rc = 0; + + spin_lock_irqsave(&head->lock, flags); + + if (head->next_cpu == bcpu && uv_read_rtc() >= *t) + rc = 1; + + *t = ULLONG_MAX; + + /* Was the hardware setup for this timer? */ + if (head->next_cpu == bcpu) + uv_rtc_find_next_timer(head, pnode); + + spin_unlock_irqrestore(&head->lock, flags); + + return rc; +} + + +/* + * Kernel interface routines. + */ + +/* + * Read the RTC. + */ +static cycle_t uv_read_rtc(void) +{ + return (cycle_t)uv_read_local_mmr(UVH_RTC); +} + +/* + * Program the next event, relative to now + */ +static int uv_rtc_next_event(unsigned long delta, + struct clock_event_device *ced) +{ + int ced_cpu = cpumask_first(ced->cpumask); + + return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc()); +} + +/* + * Setup the RTC timer in oneshot mode + */ +static void uv_rtc_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + int ced_cpu = cpumask_first(evt->cpumask); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here yet */ + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + uv_rtc_unset_timer(ced_cpu); + break; + } +} + +static void uv_rtc_interrupt(void) +{ + struct clock_event_device *ced = &__get_cpu_var(cpu_ced); + int cpu = smp_processor_id(); + + if (!ced || !ced->event_handler) + return; + + if (uv_rtc_unset_timer(cpu) != 1) + return; + + ced->event_handler(ced); +} + +static int __init uv_enable_rtc(char *str) +{ + uv_rtc_enable = 1; + + return 1; +} +__setup("uvrtc", uv_enable_rtc); + +static __init void uv_rtc_register_clockevents(struct work_struct *dummy) +{ + struct clock_event_device *ced = &__get_cpu_var(cpu_ced); + + *ced = clock_event_device_uv; + ced->cpumask = cpumask_of(smp_processor_id()); + clockevents_register_device(ced); +} + +static __init int uv_rtc_setup_clock(void) +{ + int rc; + + if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) + return -ENODEV; + + generic_interrupt_extension = uv_rtc_interrupt; + + clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, + clocksource_uv.shift); + + rc = clocksource_register(&clocksource_uv); + if (rc) { + generic_interrupt_extension = NULL; + return rc; + } + + /* Setup and register clockevents */ + rc = uv_rtc_allocate_timers(); + if (rc) { + clocksource_unregister(&clocksource_uv); + generic_interrupt_extension = NULL; + return rc; + } + + clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, + NSEC_PER_SEC, clock_event_device_uv.shift); + + clock_event_device_uv.min_delta_ns = NSEC_PER_SEC / + sn_rtc_cycles_per_second; + + clock_event_device_uv.max_delta_ns = clocksource_uv.mask * + (NSEC_PER_SEC / sn_rtc_cycles_per_second); + + rc = schedule_on_each_cpu(uv_rtc_register_clockevents); + if (rc) { + clocksource_unregister(&clocksource_uv); + generic_interrupt_extension = NULL; + uv_rtc_deallocate_timers(); + } + + return rc; +} +arch_initcall(uv_rtc_setup_clock); -- cgit v1.2.3 From 8d4dd919b46ed982da6ef6bf6fcec454cd7a5b1b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 4 Mar 2009 01:25:21 -0800 Subject: x86: ioremap mptable Impact: fix boot with mptable above max_low_mapped Try to use early_ioremap() to map MPC to make sure it works even it is at the end of ram. Signed-off-by: Yinghai Lu Cc: Andrew Morton LKML-Reference: <49AE4901.3090801@kernel.org> Signed-off-by: Ingo Molnar Reported-and-tested-by: Kevin O'Connor --- arch/x86/kernel/mpparse.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 37cb1bda1ba..ae9060cb448 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -558,6 +558,19 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) static struct mpf_intel *mpf_found; +static unsigned long __init get_mpc_size(unsigned long physptr) +{ + struct mpc_table *mpc; + unsigned long size; + + mpc = early_ioremap(physptr, PAGE_SIZE); + size = mpc->length; + early_iounmap(mpc, PAGE_SIZE); + apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); + + return size; +} + /* * Scan the memory blocks for an SMP configuration block. */ @@ -611,12 +624,16 @@ static void __init __get_smp_config(unsigned int early) construct_default_ISA_mptable(mpf->feature1); } else if (mpf->physptr) { + struct mpc_table *mpc; + unsigned long size; + size = get_mpc_size(mpf->physptr); + mpc = early_ioremap(mpf->physptr, size); /* * Read the physical hardware table. Anything here will * override the defaults. */ - if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) { + if (!smp_read_mpc(mpc, early)) { #ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 0; #endif @@ -624,8 +641,10 @@ static void __init __get_smp_config(unsigned int early) "BIOS bug, MP table errors detected!...\n"); printk(KERN_ERR "... disabling SMP support. " "(tell your hw vendor)\n"); + early_iounmap(mpc, size); return; } + early_iounmap(mpc, size); if (early) return; -- cgit v1.2.3 From f62432395ec54e93f113091bcb2e2017eeed7683 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 4 Mar 2009 01:25:54 -0800 Subject: x86: reserve exact size of mptable Impact: save a bit of RAM Get the exact size for the reserve_bootmem() call. Signed-off-by: Yinghai Lu Cc: Andrew Morton LKML-Reference: <49AE4922.605@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index ae9060cb448..e8192401da4 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -716,10 +716,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, if (!reserve) return 1; - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, + reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf), BOOTMEM_DEFAULT); if (mpf->physptr) { - unsigned long size = PAGE_SIZE; + unsigned long size = get_mpc_size(mpf->physptr); #ifdef CONFIG_X86_32 /* * We cannot access to MPC table to compute -- cgit v1.2.3 From 1400b3faab8fedfffde5a7fe47098e2732d4aa76 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 4 Mar 2009 16:02:46 -0600 Subject: x86: UV, SGI RTC: fix uv_time.c for UP Fix non-smp build of uv_time.c. Signed-off-by: Dimitri Sivanich LKML-Reference: <20090304220246.GC6288@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/uv_time.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index 6f8e3256ab2..2ffb6c53326 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #define RTC_NAME "sgi_rtc" @@ -84,7 +86,7 @@ static void uv_rtc_send_IPI(int cpu) unsigned long apicid, val; int pnode; - apicid = per_cpu(x86_cpu_to_apicid, cpu); + apicid = cpu_physical_id(cpu); pnode = uv_apicid_to_pnode(apicid); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (apicid << UVH_IPI_INT_APIC_ID_SHFT) | -- cgit v1.2.3 From ed26dbe5ae045e5bf95c6dc27497397a3fde52e1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 4 Mar 2009 16:16:51 -0800 Subject: x86: pre-initialize boot_cpu_data.x86_phys_bits to avoid system_state tests Impact: cleanup, micro-optimization Pre-initialize boot_cpu_data.x86_phys_bits to a reasonable default to remove the use of system_state tests in __virt_addr_valid() and __phys_addr(). Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b746deb9ebc..f28c56e6bf9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -202,7 +202,9 @@ struct ist_info ist_info; #endif #else -struct cpuinfo_x86 boot_cpu_data __read_mostly; +struct cpuinfo_x86 boot_cpu_data __read_mostly = { + .x86_phys_bits = MAX_PHYSMEM_BITS, +}; EXPORT_SYMBOL(boot_cpu_data); #endif -- cgit v1.2.3 From 1f442d70c84aa798e243e721eba728a98434cd86 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 7 Mar 2009 23:46:26 -0800 Subject: x86: remove smp_apply_quirks()/smp_checks() Impact: cleanup and code size reduction on 64-bit This code is only applied to Intel Pentium and AMD K7 32-bit cpus. Move those checks to intel_init()/amd_init() for 32-bit so 64-bit will not build this code. Also change to use cpu_index check to see if we need to emit warning. Signed-off-by: Yinghai Lu LKML-Reference: <49B377D2.8030108@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 52 ++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/intel.c | 25 +++++++++++++++ arch/x86/kernel/smpboot.c | 78 --------------------------------------------- 3 files changed, 77 insertions(+), 78 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 25423a5b80e..f47df59016c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -5,6 +5,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 # include @@ -141,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) } } +static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + /* calling is from identify_secondary_cpu() ? */ + if (c->cpu_index == boot_cpu_id) + return; + + /* + * Certain Athlons might work (for various values of 'work') in SMP + * but they are not certified as MP capable. + */ + /* Athlon 660/661 is valid. */ + if ((c->x86_model == 6) && ((c->x86_mask == 0) || + (c->x86_mask == 1))) + goto valid_k7; + + /* Duron 670 is valid */ + if ((c->x86_model == 7) && (c->x86_mask == 0)) + goto valid_k7; + + /* + * Athlon 662, Duron 671, and Athlon >model 7 have capability + * bit. It's worth noting that the A5 stepping (662) of some + * Athlon XP's have the MP bit set. + * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for + * more. + */ + if (((c->x86_model == 6) && (c->x86_mask >= 2)) || + ((c->x86_model == 7) && (c->x86_mask >= 1)) || + (c->x86_model > 7)) + if (cpu_has_mp) + goto valid_k7; + + /* If we get here, not a certified SMP capable AMD system. */ + + /* + * Don't taint if we are running SMP kernel on a single non-MP + * approved Athlon + */ + WARN_ONCE(1, "WARNING: This combination of AMD" + "processors is not suitable for SMP.\n"); + if (!test_taint(TAINT_UNSAFE_SMP)) + add_taint(TAINT_UNSAFE_SMP); + +valid_k7: + ; +#endif +} + static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) { u32 l, h; @@ -175,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) } set_cpu_cap(c, X86_FEATURE_K7); + + amd_k7_smp_check(c); } #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 25c559ba8d5..191117f1ad5 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include @@ -110,6 +111,28 @@ static void __cpuinit trap_init_f00f_bug(void) } #endif +static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + /* calling is from identify_secondary_cpu() ? */ + if (c->cpu_index == boot_cpu_id) + return; + + /* + * Mask B, Pentium, but not Pentium MMX + */ + if (c->x86 == 5 && + c->x86_mask >= 1 && c->x86_mask <= 4 && + c->x86_model <= 3) { + /* + * Remember we have B step Pentia with bugs + */ + WARN_ONCE(1, "WARNING: SMP operation may be unreliable" + "with B stepping processors.\n"); + } +#endif +} + static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) { unsigned long lo, hi; @@ -186,6 +209,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_NUMAQ numaq_tsc_disable(); #endif + + intel_smp_check(c); } #else static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 249334f5080..ef7d10170c3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -114,10 +114,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); atomic_t init_deasserted; - -/* Set if we find a B stepping CPU */ -static int __cpuinitdata smp_b_stepping; - #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) /* which logical CPUs are on which nodes */ @@ -271,8 +267,6 @@ static void __cpuinit smp_callin(void) cpumask_set_cpu(cpuid, cpu_callin_mask); } -static int __cpuinitdata unsafe_smp; - /* * Activate a secondary processor. */ @@ -340,76 +334,6 @@ notrace static void __cpuinit start_secondary(void *unused) cpu_idle(); } -static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) -{ - /* - * Mask B, Pentium, but not Pentium MMX - */ - if (c->x86_vendor == X86_VENDOR_INTEL && - c->x86 == 5 && - c->x86_mask >= 1 && c->x86_mask <= 4 && - c->x86_model <= 3) - /* - * Remember we have B step Pentia with bugs - */ - smp_b_stepping = 1; - - /* - * Certain Athlons might work (for various values of 'work') in SMP - * but they are not certified as MP capable. - */ - if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) { - - if (num_possible_cpus() == 1) - goto valid_k7; - - /* Athlon 660/661 is valid. */ - if ((c->x86_model == 6) && ((c->x86_mask == 0) || - (c->x86_mask == 1))) - goto valid_k7; - - /* Duron 670 is valid */ - if ((c->x86_model == 7) && (c->x86_mask == 0)) - goto valid_k7; - - /* - * Athlon 662, Duron 671, and Athlon >model 7 have capability - * bit. It's worth noting that the A5 stepping (662) of some - * Athlon XP's have the MP bit set. - * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for - * more. - */ - if (((c->x86_model == 6) && (c->x86_mask >= 2)) || - ((c->x86_model == 7) && (c->x86_mask >= 1)) || - (c->x86_model > 7)) - if (cpu_has_mp) - goto valid_k7; - - /* If we get here, not a certified SMP capable AMD system. */ - unsafe_smp = 1; - } - -valid_k7: - ; -} - -static void __cpuinit smp_checks(void) -{ - if (smp_b_stepping) - printk(KERN_WARNING "WARNING: SMP operation may be unreliable" - "with B stepping processors.\n"); - - /* - * Don't taint if we are running SMP kernel on a single non-MP - * approved Athlon - */ - if (unsafe_smp && num_online_cpus() > 1) { - printk(KERN_INFO "WARNING: This combination of AMD" - "processors is not suitable for SMP.\n"); - add_taint(TAINT_UNSAFE_SMP); - } -} - /* * The bootstrap kernel entry code has set these up. Save them for * a given CPU @@ -423,7 +347,6 @@ void __cpuinit smp_store_cpu_info(int id) c->cpu_index = id; if (id != 0) identify_secondary_cpu(c); - smp_apply_quirks(c); } @@ -1193,7 +1116,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) pr_debug("Boot done.\n"); impress_friends(); - smp_checks(); #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif -- cgit v1.2.3 From 8c5dfd25519bf302ba43daa59976c4d675a594a7 Mon Sep 17 00:00:00 2001 From: Stoyan Gaydarov Date: Tue, 10 Mar 2009 00:10:32 -0500 Subject: x86: BUG to BUG_ON changes Impact: cleanup Signed-off-by: Stoyan Gaydarov LKML-Reference: <1236661850-8237-8-git-send-email-stoyboyker@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 6 ++---- arch/x86/kernel/quirks.c | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 826d5c87627..f8869978bbb 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1078,8 +1078,7 @@ void __cpuinit cpu_init(void) atomic_inc(&init_mm.mm_count); me->active_mm = &init_mm; - if (me->mm) - BUG(); + BUG_ON(me->mm); enter_lazy_tlb(&init_mm, me); load_sp0(t, ¤t->thread); @@ -1145,8 +1144,7 @@ void __cpuinit cpu_init(void) */ atomic_inc(&init_mm.mm_count); curr->active_mm = &init_mm; - if (curr->mm) - BUG(); + BUG_ON(curr->mm); enter_lazy_tlb(&init_mm, curr); load_sp0(t, thread); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 309949e9e1c..6a5a2970f4c 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -74,8 +74,7 @@ static void ich_force_hpet_resume(void) if (!force_hpet_address) return; - if (rcba_base == NULL) - BUG(); + BUG_ON(rcba_base == NULL); /* read the Function Disable register, dword mode only */ val = readl(rcba_base + 0x3404); -- cgit v1.2.3 From 9b779edf4b97798d037bb44fca2719ac184d0f14 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Tue, 10 Mar 2009 15:37:51 +0530 Subject: x86: cpu architecture debug code Introduce: cat /sys/kernel/debug/x86/cpu/* for Intel and AMD processors to view / debug the state of each CPU. By using this we can debug whole range of registers and other cpu information for debugging purpose and monitor how things are changing. This can be useful for developers as well as for users. Signed-off-by: Jaswinder Singh Rajput LKML-Reference: <1236701373.3387.4.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 2 + arch/x86/kernel/cpu/cpu_debug.c | 784 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 786 insertions(+) create mode 100755 arch/x86/kernel/cpu/cpu_debug.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 82db7f45e2d..d4356f8b752 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -14,6 +14,8 @@ obj-y += vmware.o hypervisor.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o +obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o + obj-$(CONFIG_CPU_SUP_INTEL) += intel.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c new file mode 100755 index 00000000000..0bdf4daba20 --- /dev/null +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -0,0 +1,784 @@ +/* + * CPU x86 architecture debug code + * + * Copyright(C) 2009 Jaswinder Singh Rajput + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); +static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); +static DEFINE_PER_CPU(unsigned, cpu_modelflag); +static DEFINE_PER_CPU(int, cpu_priv_count); +static DEFINE_PER_CPU(unsigned, cpu_model); + +static DEFINE_MUTEX(cpu_debug_lock); + +static struct dentry *cpu_debugfs_dir; + +static struct cpu_debug_base cpu_base[] = { + { "mc", CPU_MC }, /* Machine Check */ + { "monitor", CPU_MONITOR }, /* Monitor */ + { "time", CPU_TIME }, /* Time */ + { "pmc", CPU_PMC }, /* Performance Monitor */ + { "platform", CPU_PLATFORM }, /* Platform */ + { "apic", CPU_APIC }, /* APIC */ + { "poweron", CPU_POWERON }, /* Power-on */ + { "control", CPU_CONTROL }, /* Control */ + { "features", CPU_FEATURES }, /* Features control */ + { "lastbranch", CPU_LBRANCH }, /* Last Branch */ + { "bios", CPU_BIOS }, /* BIOS */ + { "freq", CPU_FREQ }, /* Frequency */ + { "mtrr", CPU_MTRR }, /* MTRR */ + { "perf", CPU_PERF }, /* Performance */ + { "cache", CPU_CACHE }, /* Cache */ + { "sysenter", CPU_SYSENTER }, /* Sysenter */ + { "therm", CPU_THERM }, /* Thermal */ + { "misc", CPU_MISC }, /* Miscellaneous */ + { "debug", CPU_DEBUG }, /* Debug */ + { "pat", CPU_PAT }, /* PAT */ + { "vmx", CPU_VMX }, /* VMX */ + { "call", CPU_CALL }, /* System Call */ + { "base", CPU_BASE }, /* BASE Address */ + { "smm", CPU_SMM }, /* System mgmt mode */ + { "svm", CPU_SVM }, /*Secure Virtial Machine*/ + { "osvm", CPU_OSVM }, /* OS-Visible Workaround*/ + { "tss", CPU_TSS }, /* Task Stack Segment */ + { "cr", CPU_CR }, /* Control Registers */ + { "dt", CPU_DT }, /* Descriptor Table */ + { "registers", CPU_REG_ALL }, /* Select all Registers */ +}; + +static struct cpu_file_base cpu_file[] = { + { "index", CPU_REG_ALL }, /* index */ + { "value", CPU_REG_ALL }, /* value */ +}; + +/* Intel Registers Range */ +static struct cpu_debug_range cpu_intel_range[] = { + { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL }, + { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE }, + { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL }, + { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM }, + { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE }, + { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE }, + + { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE }, + { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON }, + { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON }, + { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE }, + + { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE }, + { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT }, + { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT }, + { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM }, + + { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE }, + { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 }, + { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE }, + { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON }, + + { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT }, + { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT }, + { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT }, + { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE }, + + { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 }, + { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 }, + { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX }, + { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 }, + { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT }, + + { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE }, + { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE }, + { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE }, + { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT }, + { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE }, + { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE }, + { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE }, + { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE }, + + { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT }, + { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON }, + { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE }, + { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON }, + { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE }, + { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 }, + { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE }, + { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 }, + + { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE }, + { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE }, + { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE }, + { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE }, + { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE }, + { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE }, + + { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON }, + { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE }, + { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON }, + { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT }, + { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON }, + { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT }, + { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON }, + { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON }, + { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON }, + { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON }, + { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE }, + { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON }, + + { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE }, + { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON }, + { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE }, + { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON }, + { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE }, + { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON }, + { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE }, + { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON }, + { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE }, + { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE }, + { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE }, + + { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE }, + { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON }, + { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON }, + + { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP }, + + { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON }, + { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON }, + { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON }, + { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON }, +}; + +/* AMD Registers Range */ +static struct cpu_debug_range cpu_amd_range[] = { + { 0x00000010, 0x00000010, CPU_TIME, CPU_ALL, }, + { 0x0000001B, 0x0000001B, CPU_APIC, CPU_ALL, }, + { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_ALL, }, + + { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_ALL, }, + { 0x00000179, 0x0000017A, CPU_MC, CPU_ALL, }, + { 0x0000017B, 0x0000017B, CPU_MC, CPU_ALL, }, + { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_ALL, }, + { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_ALL, }, + + { 0x00000200, 0x0000020F, CPU_MTRR, CPU_ALL, }, + { 0x00000250, 0x00000250, CPU_MTRR, CPU_ALL, }, + { 0x00000258, 0x00000259, CPU_MTRR, CPU_ALL, }, + { 0x00000268, 0x0000026F, CPU_MTRR, CPU_ALL, }, + { 0x00000277, 0x00000277, CPU_PAT, CPU_ALL, }, + { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_ALL, }, + + { 0x00000400, 0x00000417, CPU_MC, CPU_ALL, }, + + { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_ALL, }, + { 0xC0000081, 0xC0000084, CPU_CALL, CPU_ALL, }, + { 0xC0000100, 0xC0000102, CPU_BASE, CPU_ALL, }, + { 0xC0000103, 0xC0000103, CPU_TIME, CPU_ALL, }, + + { 0xC0000408, 0xC000040A, CPU_MC, CPU_ALL, }, + + { 0xc0010000, 0xc0010007, CPU_PMC, CPU_ALL, }, + { 0xc0010010, 0xc0010010, CPU_MTRR, CPU_ALL, }, + { 0xc0010016, 0xc001001A, CPU_MTRR, CPU_ALL, }, + { 0xc001001D, 0xc001001D, CPU_MTRR, CPU_ALL, }, + { 0xc0010030, 0xc0010035, CPU_BIOS, CPU_ALL, }, + { 0xc0010056, 0xc0010056, CPU_SMM, CPU_ALL, }, + { 0xc0010061, 0xc0010063, CPU_SMM, CPU_ALL, }, + { 0xc0010074, 0xc0010074, CPU_MC, CPU_ALL, }, + { 0xc0010111, 0xc0010113, CPU_SMM, CPU_ALL, }, + { 0xc0010114, 0xc0010118, CPU_SVM, CPU_ALL, }, + { 0xc0010119, 0xc001011A, CPU_SMM, CPU_ALL, }, + { 0xc0010140, 0xc0010141, CPU_OSVM, CPU_ALL, }, + { 0xc0010156, 0xc0010156, CPU_SMM, CPU_ALL, }, +}; + + +static int get_cpu_modelflag(unsigned cpu) +{ + int flag; + + switch (per_cpu(cpu_model, cpu)) { + /* Intel */ + case 0x0501: + case 0x0502: + case 0x0504: + flag = CPU_INTEL_PENTIUM; + break; + case 0x0601: + case 0x0603: + case 0x0605: + case 0x0607: + case 0x0608: + case 0x060A: + case 0x060B: + flag = CPU_INTEL_P6; + break; + case 0x0609: + case 0x060D: + flag = CPU_INTEL_PENTIUM_M; + break; + case 0x060E: + flag = CPU_INTEL_CORE; + break; + case 0x060F: + case 0x0617: + flag = CPU_INTEL_CORE2; + break; + case 0x061C: + flag = CPU_INTEL_ATOM; + break; + case 0x0F00: + case 0x0F01: + case 0x0F02: + case 0x0F03: + case 0x0F04: + flag = CPU_INTEL_XEON_P4; + break; + case 0x0F06: + flag = CPU_INTEL_XEON_MP; + break; + default: + flag = CPU_NONE; + break; + } + + return flag; +} + +static int get_cpu_range_count(unsigned cpu) +{ + int index; + + switch (per_cpu(cpu_model, cpu) >> 16) { + case X86_VENDOR_INTEL: + index = ARRAY_SIZE(cpu_intel_range); + break; + case X86_VENDOR_AMD: + index = ARRAY_SIZE(cpu_amd_range); + break; + default: + index = 0; + break; + } + + return index; +} + +static int is_typeflag_valid(unsigned cpu, unsigned flag) +{ + unsigned vendor, modelflag; + int i, index; + + /* Standard Registers should be always valid */ + if (flag >= CPU_TSS) + return 1; + + modelflag = per_cpu(cpu_modelflag, cpu); + vendor = per_cpu(cpu_model, cpu) >> 16; + index = get_cpu_range_count(cpu); + + for (i = 0; i < index; i++) { + switch (vendor) { + case X86_VENDOR_INTEL: + if ((cpu_intel_range[i].model & modelflag) && + (cpu_intel_range[i].flag & flag)) + return 1; + break; + case X86_VENDOR_AMD: + if (cpu_amd_range[i].flag & flag) + return 1; + break; + } + } + + /* Invalid */ + return 0; +} + +static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, + int index, unsigned flag) +{ + unsigned modelflag; + + modelflag = per_cpu(cpu_modelflag, cpu); + *max = 0; + switch (per_cpu(cpu_model, cpu) >> 16) { + case X86_VENDOR_INTEL: + if ((cpu_intel_range[index].model & modelflag) && + (cpu_intel_range[index].flag & flag)) { + *min = cpu_intel_range[index].min; + *max = cpu_intel_range[index].max; + } + break; + case X86_VENDOR_AMD: + if (cpu_amd_range[index].flag & flag) { + *min = cpu_amd_range[index].min; + *max = cpu_amd_range[index].max; + } + break; + } + + return *max; +} + +/* This function can also be called with seq = NULL for printk */ +static void print_cpu_data(struct seq_file *seq, unsigned type, + u32 low, u32 high) +{ + struct cpu_private *priv; + u64 val = high; + + if (seq) { + priv = seq->private; + if (priv->file) { + val = (val << 32) | low; + seq_printf(seq, "0x%llx\n", val); + } else + seq_printf(seq, " %08x: %08x_%08x\n", + type, high, low); + } else + printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low); +} + +/* This function can also be called with seq = NULL for printk */ +static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) +{ + unsigned msr, msr_min, msr_max; + struct cpu_private *priv; + u32 low, high; + int i, range; + + if (seq) { + priv = seq->private; + if (priv->file) { + if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg, + &low, &high)) + print_cpu_data(seq, priv->reg, low, high); + return; + } + } + + range = get_cpu_range_count(cpu); + + for (i = 0; i < range; i++) { + if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) + continue; + + for (msr = msr_min; msr <= msr_max; msr++) { + if (rdmsr_safe_on_cpu(cpu, msr, &low, &high)) + continue; + print_cpu_data(seq, msr, low, high); + } + } +} + +static void print_tss(void *arg) +{ + struct pt_regs *regs = task_pt_regs(current); + struct seq_file *seq = arg; + unsigned int seg; + + seq_printf(seq, " RAX\t: %016lx\n", regs->ax); + seq_printf(seq, " RBX\t: %016lx\n", regs->bx); + seq_printf(seq, " RCX\t: %016lx\n", regs->cx); + seq_printf(seq, " RDX\t: %016lx\n", regs->dx); + + seq_printf(seq, " RSI\t: %016lx\n", regs->si); + seq_printf(seq, " RDI\t: %016lx\n", regs->di); + seq_printf(seq, " RBP\t: %016lx\n", regs->bp); + seq_printf(seq, " ESP\t: %016lx\n", regs->sp); + +#ifdef CONFIG_X86_64 + seq_printf(seq, " R08\t: %016lx\n", regs->r8); + seq_printf(seq, " R09\t: %016lx\n", regs->r9); + seq_printf(seq, " R10\t: %016lx\n", regs->r10); + seq_printf(seq, " R11\t: %016lx\n", regs->r11); + seq_printf(seq, " R12\t: %016lx\n", regs->r12); + seq_printf(seq, " R13\t: %016lx\n", regs->r13); + seq_printf(seq, " R14\t: %016lx\n", regs->r14); + seq_printf(seq, " R15\t: %016lx\n", regs->r15); +#endif + + asm("movl %%cs,%0" : "=r" (seg)); + seq_printf(seq, " CS\t: %04x\n", seg); + asm("movl %%ds,%0" : "=r" (seg)); + seq_printf(seq, " DS\t: %04x\n", seg); + seq_printf(seq, " SS\t: %04lx\n", regs->ss); + asm("movl %%es,%0" : "=r" (seg)); + seq_printf(seq, " ES\t: %04x\n", seg); + asm("movl %%fs,%0" : "=r" (seg)); + seq_printf(seq, " FS\t: %04x\n", seg); + asm("movl %%gs,%0" : "=r" (seg)); + seq_printf(seq, " GS\t: %04x\n", seg); + + seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags); + + seq_printf(seq, " EIP\t: %016lx\n", regs->ip); +} + +static void print_cr(void *arg) +{ + struct seq_file *seq = arg; + + seq_printf(seq, " cr0\t: %016lx\n", read_cr0()); + seq_printf(seq, " cr2\t: %016lx\n", read_cr2()); + seq_printf(seq, " cr3\t: %016lx\n", read_cr3()); + seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe()); +#ifdef CONFIG_X86_64 + seq_printf(seq, " cr8\t: %016lx\n", read_cr8()); +#endif +} + +static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt) +{ + seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size)); +} + +static void print_dt(void *seq) +{ + struct desc_ptr dt; + unsigned long ldt; + + /* IDT */ + store_idt((struct desc_ptr *)&dt); + print_desc_ptr("IDT", seq, dt); + + /* GDT */ + store_gdt((struct desc_ptr *)&dt); + print_desc_ptr("GDT", seq, dt); + + /* LDT */ + store_ldt(ldt); + seq_printf(seq, " LDT\t: %016lx\n", ldt); + + /* TR */ + store_tr(ldt); + seq_printf(seq, " TR\t: %016lx\n", ldt); +} + +static void print_dr(void *arg) +{ + struct seq_file *seq = arg; + unsigned long dr; + int i; + + for (i = 0; i < 8; i++) { + /* Ignore db4, db5 */ + if ((i == 4) || (i == 5)) + continue; + get_debugreg(dr, i); + seq_printf(seq, " dr%d\t: %016lx\n", i, dr); + } + + seq_printf(seq, "\n MSR\t:\n"); +} + +static void print_apic(void *arg) +{ + struct seq_file *seq = arg; + +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(seq, " LAPIC\t:\n"); + seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24); + seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR)); + seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI)); + seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI)); + seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI)); + seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR)); + seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR)); + seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV)); + seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR)); + seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR)); + seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR)); + seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2)); + seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT)); + seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR)); + seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC)); + seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0)); + seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1)); + seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR)); + seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); + seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); + seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); +#endif /* CONFIG_X86_LOCAL_APIC */ + + seq_printf(seq, "\n MSR\t:\n"); +} + +static int cpu_seq_show(struct seq_file *seq, void *v) +{ + struct cpu_private *priv = seq->private; + + if (priv == NULL) + return -EINVAL; + + switch (cpu_base[priv->type].flag) { + case CPU_TSS: + smp_call_function_single(priv->cpu, print_tss, seq, 1); + break; + case CPU_CR: + smp_call_function_single(priv->cpu, print_cr, seq, 1); + break; + case CPU_DT: + smp_call_function_single(priv->cpu, print_dt, seq, 1); + break; + case CPU_DEBUG: + if (priv->file == CPU_INDEX_BIT) + smp_call_function_single(priv->cpu, print_dr, seq, 1); + print_msr(seq, priv->cpu, cpu_base[priv->type].flag); + break; + case CPU_APIC: + if (priv->file == CPU_INDEX_BIT) + smp_call_function_single(priv->cpu, print_apic, seq, 1); + print_msr(seq, priv->cpu, cpu_base[priv->type].flag); + break; + + default: + print_msr(seq, priv->cpu, cpu_base[priv->type].flag); + break; + } + seq_printf(seq, "\n"); + + return 0; +} + +static void *cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos == 0) /* One time is enough ;-) */ + return seq; + + return NULL; +} + +static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + (*pos)++; + + return cpu_seq_start(seq, pos); +} + +static void cpu_seq_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations cpu_seq_ops = { + .start = cpu_seq_start, + .next = cpu_seq_next, + .stop = cpu_seq_stop, + .show = cpu_seq_show, +}; + +static int cpu_seq_open(struct inode *inode, struct file *file) +{ + struct cpu_private *priv = inode->i_private; + struct seq_file *seq; + int err; + + err = seq_open(file, &cpu_seq_ops); + if (!err) { + seq = file->private_data; + seq->private = priv; + } + + return err; +} + +static const struct file_operations cpu_fops = { + .open = cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg, + unsigned file, struct dentry *dentry) +{ + struct cpu_private *priv = NULL; + + /* Already intialized */ + if (file == CPU_INDEX_BIT) + if (per_cpu(cpu_arr[type].init, cpu)) + return 0; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv == NULL) + return -ENOMEM; + + priv->cpu = cpu; + priv->type = type; + priv->reg = reg; + priv->file = file; + mutex_lock(&cpu_debug_lock); + per_cpu(priv_arr[type], cpu) = priv; + per_cpu(cpu_priv_count, cpu)++; + mutex_unlock(&cpu_debug_lock); + + if (file) + debugfs_create_file(cpu_file[file].name, S_IRUGO, + dentry, (void *)priv, &cpu_fops); + else { + debugfs_create_file(cpu_base[type].name, S_IRUGO, + per_cpu(cpu_arr[type].dentry, cpu), + (void *)priv, &cpu_fops); + mutex_lock(&cpu_debug_lock); + per_cpu(cpu_arr[type].init, cpu) = 1; + mutex_unlock(&cpu_debug_lock); + } + + return 0; +} + +static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg, + struct dentry *dentry) +{ + unsigned file; + int err = 0; + + for (file = 0; file < ARRAY_SIZE(cpu_file); file++) { + err = cpu_create_file(cpu, type, reg, file, dentry); + if (err) + return err; + } + + return err; +} + +static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry) +{ + struct dentry *cpu_dentry = NULL; + unsigned reg, reg_min, reg_max; + int i, range, err = 0; + char reg_dir[12]; + u32 low, high; + + range = get_cpu_range_count(cpu); + + for (i = 0; i < range; i++) { + if (!get_cpu_range(cpu, ®_min, ®_max, i, + cpu_base[type].flag)) + continue; + + for (reg = reg_min; reg <= reg_max; reg++) { + if (rdmsr_safe_on_cpu(cpu, reg, &low, &high)) + continue; + + sprintf(reg_dir, "0x%x", reg); + cpu_dentry = debugfs_create_dir(reg_dir, dentry); + err = cpu_init_regfiles(cpu, type, reg, cpu_dentry); + if (err) + return err; + } + } + + return err; +} + +static int cpu_init_allreg(unsigned cpu, struct dentry *dentry) +{ + struct dentry *cpu_dentry = NULL; + unsigned type; + int err = 0; + + for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) { + if (!is_typeflag_valid(cpu, cpu_base[type].flag)) + continue; + cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry); + per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry; + + if (type < CPU_TSS_BIT) + err = cpu_init_msr(cpu, type, cpu_dentry); + else + err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT, + cpu_dentry); + if (err) + return err; + } + + return err; +} + +static int cpu_init_cpu(void) +{ + struct dentry *cpu_dentry = NULL; + struct cpuinfo_x86 *cpui; + char cpu_dir[12]; + unsigned cpu; + int err = 0; + + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { + cpui = &cpu_data(cpu); + if (!cpu_has(cpui, X86_FEATURE_MSR)) + continue; + per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) | + (cpui->x86 << 8) | + (cpui->x86_model)); + per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu); + + sprintf(cpu_dir, "cpu%d", cpu); + cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); + err = cpu_init_allreg(cpu, cpu_dentry); + + pr_info("cpu%d(%d) debug files %d\n", + cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu)); + if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) { + pr_err("Register files count %d exceeds limit %d\n", + per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES); + per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES; + err = -ENFILE; + } + if (err) + return err; + } + + return err; +} + +static int __init cpu_debug_init(void) +{ + cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir); + + return cpu_init_cpu(); +} + +static void __exit cpu_debug_exit(void) +{ + int i, cpu; + + if (cpu_debugfs_dir) + debugfs_remove_recursive(cpu_debugfs_dir); + + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++) + kfree(per_cpu(priv_arr[i], cpu)); +} + +module_init(cpu_debug_init); +module_exit(cpu_debug_exit); + +MODULE_AUTHOR("Jaswinder Singh Rajput"); +MODULE_DESCRIPTION("CPU Debug module"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From fef3a7a17418814733ebde0b40d8e32747677c8f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 10 Mar 2009 10:56:57 +0800 Subject: x86, kexec: fix kexec x86 coding style Impact: Cleanup Fix some coding style issue for kexec x86. Signed-off-by: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/kernel/machine_kexec_32.c | 17 ++++++++++------- arch/x86/kernel/machine_kexec_64.c | 15 ++++++++------- arch/x86/kernel/relocate_kernel_32.S | 24 ++++++++++++++++-------- arch/x86/kernel/relocate_kernel_64.S | 24 ++++++++++++++++-------- 4 files changed, 50 insertions(+), 30 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index f5fc8c781a6..e7368c1da01 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -14,12 +14,12 @@ #include #include #include +#include #include #include #include #include -#include #include #include #include @@ -63,7 +63,7 @@ static void load_segments(void) "\tmovl %%eax,%%fs\n" "\tmovl %%eax,%%gs\n" "\tmovl %%eax,%%ss\n" - ::: "eax", "memory"); + : : : "eax", "memory"); #undef STR #undef __STR } @@ -205,7 +205,8 @@ void machine_kexec(struct kimage *image) if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC - /* We need to put APICs in legacy mode so that we can + /* + * We need to put APICs in legacy mode so that we can * get timer interrupts in second kernel. kexec/kdump * paths already have calls to disable_IO_APIC() in * one form or other. kexec jump path also need @@ -227,7 +228,8 @@ void machine_kexec(struct kimage *image) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); - /* The segment registers are funny things, they have both a + /* + * The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is * set to a specific selector, the invisible part is loaded * with from a table in memory. At no other time is the @@ -237,11 +239,12 @@ void machine_kexec(struct kimage *image) * segments, before I zap the gdt with an invalid value. */ load_segments(); - /* The gdt & idt are now invalid. + /* + * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - set_gdt(phys_to_virt(0),0); - set_idt(phys_to_virt(0),0); + set_gdt(phys_to_virt(0), 0); + set_idt(phys_to_virt(0), 0); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 6993d51b7fd..f8c796fffa0 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -12,11 +12,11 @@ #include #include #include +#include #include #include #include -#include static void init_level2_page(pmd_t *level2p, unsigned long addr) { @@ -83,9 +83,8 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p, } level3p = (pud_t *)page_address(page); result = init_level3_page(image, level3p, addr, last_addr); - if (result) { + if (result) goto out; - } set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); addr += PGDIR_SIZE; } @@ -242,7 +241,8 @@ void machine_kexec(struct kimage *image) page_list[PA_TABLE_PAGE] = (unsigned long)__pa(page_address(image->control_code_page)); - /* The segment registers are funny things, they have both a + /* + * The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is * set to a specific selector, the invisible part is loaded * with from a table in memory. At no other time is the @@ -252,11 +252,12 @@ void machine_kexec(struct kimage *image) * segments, before I zap the gdt with an invalid value. */ load_segments(); - /* The gdt & idt are now invalid. + /* + * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - set_gdt(phys_to_virt(0),0); - set_idt(phys_to_virt(0),0); + set_gdt(phys_to_virt(0), 0); + set_idt(phys_to_virt(0), 0); /* now call it */ relocate_kernel((unsigned long)image->head, (unsigned long)page_list, diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 2064d0aa8d2..41235531b11 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -17,7 +17,8 @@ #define PTR(x) (x << 2) -/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE +/* + * control_page + KEXEC_CONTROL_CODE_MAX_SIZE * ~ control_page + PAGE_SIZE are used as data storage and stack for * jumping back */ @@ -76,8 +77,10 @@ relocate_kernel: movl %eax, CP_PA_SWAP_PAGE(%edi) movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) - /* get physical address of control page now */ - /* this is impossible after page table switch */ + /* + * get physical address of control page now + * this is impossible after page table switch + */ movl PTR(PA_CONTROL_PAGE)(%ebp), %edi /* switch to new set of page tables */ @@ -97,7 +100,8 @@ identity_mapped: /* store the start address on the stack */ pushl %edx - /* Set cr0 to a known state: + /* + * Set cr0 to a known state: * - Paging disabled * - Alignment check disabled * - Write protect disabled @@ -113,7 +117,8 @@ identity_mapped: /* clear cr4 if applicable */ testl %ecx, %ecx jz 1f - /* Set cr4 to a known state: + /* + * Set cr4 to a known state: * Setting everything to zero seems safe. */ xorl %eax, %eax @@ -132,15 +137,18 @@ identity_mapped: call swap_pages addl $8, %esp - /* To be certain of avoiding problems with self-modifying code + /* + * To be certain of avoiding problems with self-modifying code * I need to execute a serializing instruction here. * So I flush the TLB, it's handy, and not processor dependent. */ xorl %eax, %eax movl %eax, %cr3 - /* set all of the registers to known values */ - /* leave %esp alone */ + /* + * set all of the registers to known values + * leave %esp alone + */ testl %esi, %esi jnz 1f diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index d32cfb27a47..cfc0d24003d 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -24,7 +24,8 @@ .code64 .globl relocate_kernel relocate_kernel: - /* %rdi indirection_page + /* + * %rdi indirection_page * %rsi page_list * %rdx start address */ @@ -33,8 +34,10 @@ relocate_kernel: pushq $0 popfq - /* get physical address of control page now */ - /* this is impossible after page table switch */ + /* + * get physical address of control page now + * this is impossible after page table switch + */ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 /* get physical address of page table now too */ @@ -55,7 +58,8 @@ identity_mapped: /* store the start address on the stack */ pushq %rdx - /* Set cr0 to a known state: + /* + * Set cr0 to a known state: * - Paging enabled * - Alignment check disabled * - Write protect disabled @@ -68,7 +72,8 @@ identity_mapped: orl $(X86_CR0_PG | X86_CR0_PE), %eax movq %rax, %cr0 - /* Set cr4 to a known state: + /* + * Set cr4 to a known state: * - physical address extension enabled */ movq $X86_CR4_PAE, %rax @@ -117,7 +122,8 @@ identity_mapped: jmp 0b 3: - /* To be certain of avoiding problems with self-modifying code + /* + * To be certain of avoiding problems with self-modifying code * I need to execute a serializing instruction here. * So I flush the TLB by reloading %cr3 here, it's handy, * and not processor dependent. @@ -125,8 +131,10 @@ identity_mapped: movq %cr3, %rax movq %rax, %cr3 - /* set all of the registers to known values */ - /* leave %rsp alone */ + /* + * set all of the registers to known values + * leave %rsp alone + */ xorq %rax, %rax xorq %rbx, %rbx -- cgit v1.2.3 From 5359454701ce51a4626b1ef6eb7b16ec35bd458d Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 10 Mar 2009 10:57:04 +0800 Subject: x86, kexec: x86_64: add identity map for pages at image->start Impact: Fix corner case that cannot yet occur image->start may be outside of 0 ~ max_pfn, for example when jumping back to original kernel from kexeced kenrel. This patch add identity map for pages at image->start. Signed-off-by: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/kernel/machine_kexec_64.c | 42 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index f8c796fffa0..7cc5d3d0148 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -18,6 +18,41 @@ #include #include +static int init_one_level2_page(struct kimage *image, pgd_t *pgd, + unsigned long addr) +{ + pud_t *pud; + pmd_t *pmd; + struct page *page; + int result = -ENOMEM; + + addr &= PMD_MASK; + pgd += pgd_index(addr); + if (!pgd_present(*pgd)) { + page = kimage_alloc_control_pages(image, 0); + if (!page) + goto out; + pud = (pud_t *)page_address(page); + memset(pud, 0, PAGE_SIZE); + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) { + page = kimage_alloc_control_pages(image, 0); + if (!page) + goto out; + pmd = (pmd_t *)page_address(page); + memset(pmd, 0, PAGE_SIZE); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + result = 0; +out: + return result; +} + static void init_level2_page(pmd_t *level2p, unsigned long addr) { unsigned long end_addr; @@ -153,6 +188,13 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) int result; level4p = (pgd_t *)__va(start_pgtable); result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); + if (result) + return result; + /* + * image->start may be outside 0 ~ max_pfn, for example when + * jump back to original kernel from kexeced kernel + */ + result = init_one_level2_page(image, level4p, image->start); if (result) return result; return init_transition_pgtable(image, level4p); -- cgit v1.2.3 From fee7b0d84cc8c7bc5dc212901c79e93eaf83a5b5 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 10 Mar 2009 10:57:16 +0800 Subject: x86, kexec: x86_64: add kexec jump support for x86_64 Impact: New major feature This patch add kexec jump support for x86_64. More information about kexec jump can be found in corresponding x86_32 support patch. Signed-off-by: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/kernel/machine_kexec_64.c | 42 ++++++++- arch/x86/kernel/relocate_kernel_64.S | 177 ++++++++++++++++++++++++++++------- arch/x86/kernel/vmlinux_64.lds.S | 7 ++ 3 files changed, 189 insertions(+), 37 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 7cc5d3d0148..89cea4d4467 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -270,19 +271,43 @@ void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; void *control_page; + int save_ftrace_enabled; - tracer_disable(); +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) + save_processor_state(); +#endif + + save_ftrace_enabled = __ftrace_enabled_save(); /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + if (image->preserve_context) { +#ifdef CONFIG_X86_IO_APIC + /* + * We need to put APICs in legacy mode so that we can + * get timer interrupts in second kernel. kexec/kdump + * paths already have calls to disable_IO_APIC() in + * one form or other. kexec jump path also need + * one. + */ + disable_IO_APIC(); +#endif + } + control_page = page_address(image->control_code_page) + PAGE_SIZE; - memcpy(control_page, relocate_kernel, PAGE_SIZE); + memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; page_list[PA_TABLE_PAGE] = (unsigned long)__pa(page_address(image->control_code_page)); + if (image->type == KEXEC_TYPE_DEFAULT) + page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) + << PAGE_SHIFT); + /* * The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is @@ -302,8 +327,17 @@ void machine_kexec(struct kimage *image) set_idt(phys_to_virt(0), 0); /* now call it */ - relocate_kernel((unsigned long)image->head, (unsigned long)page_list, - image->start); + image->start = relocate_kernel((unsigned long)image->head, + (unsigned long)page_list, + image->start, + image->preserve_context); + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) + restore_processor_state(); +#endif + + __ftrace_enabled_restore(save_ftrace_enabled); } void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index cfc0d24003d..4de8f5b3d47 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -19,6 +19,24 @@ #define PTR(x) (x << 3) #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +/* + * control_page + KEXEC_CONTROL_CODE_MAX_SIZE + * ~ control_page + PAGE_SIZE are used as data storage and stack for + * jumping back + */ +#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) + +/* Minimal CPU state */ +#define RSP DATA(0x0) +#define CR0 DATA(0x8) +#define CR3 DATA(0x10) +#define CR4 DATA(0x18) + +/* other data */ +#define CP_PA_TABLE_PAGE DATA(0x20) +#define CP_PA_SWAP_PAGE DATA(0x28) +#define CP_PA_BACKUP_PAGES_MAP DATA(0x30) + .text .align PAGE_SIZE .code64 @@ -28,8 +46,27 @@ relocate_kernel: * %rdi indirection_page * %rsi page_list * %rdx start address + * %rcx preserve_context */ + /* Save the CPU context, used for jumping back */ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushf + + movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 + movq %rsp, RSP(%r11) + movq %cr0, %rax + movq %rax, CR0(%r11) + movq %cr3, %rax + movq %rax, CR3(%r11) + movq %cr4, %rax + movq %rax, CR4(%r11) + /* zero out flags, and disable interrupts */ pushq $0 popfq @@ -41,10 +78,18 @@ relocate_kernel: movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 /* get physical address of page table now too */ - movq PTR(PA_TABLE_PAGE)(%rsi), %rcx + movq PTR(PA_TABLE_PAGE)(%rsi), %r9 + + /* get physical address of swap page now */ + movq PTR(PA_SWAP_PAGE)(%rsi), %r10 + + /* save some information for jumping back */ + movq %r9, CP_PA_TABLE_PAGE(%r11) + movq %r10, CP_PA_SWAP_PAGE(%r11) + movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) /* Switch to the identity mapped page tables */ - movq %rcx, %cr3 + movq %r9, %cr3 /* setup a new stack at the end of the physical control page */ lea PAGE_SIZE(%r8), %rsp @@ -83,9 +128,87 @@ identity_mapped: 1: /* Flush the TLB (needed?) */ - movq %rcx, %cr3 + movq %r9, %cr3 + + movq %rcx, %r11 + call swap_pages + + /* + * To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB by reloading %cr3 here, it's handy, + * and not processor dependent. + */ + movq %cr3, %rax + movq %rax, %cr3 + + /* + * set all of the registers to known values + * leave %rsp alone + */ + + testq %r11, %r11 + jnz 1f + xorq %rax, %rax + xorq %rbx, %rbx + xorq %rcx, %rcx + xorq %rdx, %rdx + xorq %rsi, %rsi + xorq %rdi, %rdi + xorq %rbp, %rbp + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r9 + xorq %r11, %r11 + xorq %r12, %r12 + xorq %r13, %r13 + xorq %r14, %r14 + xorq %r15, %r15 + + ret + +1: + popq %rdx + leaq PAGE_SIZE(%r10), %rsp + call *%rdx + + /* get the re-entry point of the peer system */ + movq 0(%rsp), %rbp + call 1f +1: + popq %r8 + subq $(1b - relocate_kernel), %r8 + movq CP_PA_SWAP_PAGE(%r8), %r10 + movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi + movq CP_PA_TABLE_PAGE(%r8), %rax + movq %rax, %cr3 + lea PAGE_SIZE(%r8), %rsp + call swap_pages + movq $virtual_mapped, %rax + pushq %rax + ret + +virtual_mapped: + movq RSP(%r8), %rsp + movq CR4(%r8), %rax + movq %rax, %cr4 + movq CR3(%r8), %rax + movq CR0(%r8), %r8 + movq %rax, %cr3 + movq %r8, %cr0 + movq %rbp, %rax + + popf + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret /* Do the copies */ +swap_pages: movq %rdi, %rcx /* Put the page_list in %rcx */ xorq %rdi, %rdi xorq %rsi, %rsi @@ -117,39 +240,27 @@ identity_mapped: movq %rcx, %rsi /* For ever source page do a copy */ andq $0xfffffffffffff000, %rsi + movq %rdi, %rdx + movq %rsi, %rax + + movq %r10, %rdi movq $512, %rcx rep ; movsq - jmp 0b -3: - /* - * To be certain of avoiding problems with self-modifying code - * I need to execute a serializing instruction here. - * So I flush the TLB by reloading %cr3 here, it's handy, - * and not processor dependent. - */ - movq %cr3, %rax - movq %rax, %cr3 - - /* - * set all of the registers to known values - * leave %rsp alone - */ + movq %rax, %rdi + movq %rdx, %rsi + movq $512, %rcx + rep ; movsq - xorq %rax, %rax - xorq %rbx, %rbx - xorq %rcx, %rcx - xorq %rdx, %rdx - xorq %rsi, %rsi - xorq %rdi, %rdi - xorq %rbp, %rbp - xorq %r8, %r8 - xorq %r9, %r9 - xorq %r10, %r9 - xorq %r11, %r11 - xorq %r12, %r12 - xorq %r13, %r13 - xorq %r14, %r14 - xorq %r15, %r15 + movq %rdx, %rdi + movq %r10, %rsi + movq $512, %rcx + rep ; movsq + lea PAGE_SIZE(%rax), %rsi + jmp 0b +3: ret + + .globl kexec_control_code_size +.set kexec_control_code_size, . - relocate_kernel diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index fbfced6f680..5bf54e40c6e 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -275,3 +275,10 @@ ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), ASSERT((per_cpu__irq_stack_union == 0), "irq_stack_union is not at start of per-cpu area"); #endif + +#ifdef CONFIG_KEXEC +#include + +ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big") +#endif -- cgit v1.2.3 From 5490fa96735ce0e2af270c0868987d644b9a38ec Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 11 Mar 2009 10:14:26 +0900 Subject: x86, mce: use round_jiffies() instead round_jiffies_relative() Impact: saving power _very_ little round_jiffies() round up absolute jiffies to full second. round_jiffies_relative() round up relative jiffies to full second. The "t->expires" is absolute jiffies. Then, round_jiffies() should be used instead round_jiffies_relative(). Signed-off-by: KOSAKI Motohiro Cc: Andi Kleen Cc: H. Peter Anvin Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index bfbd5323a63..ca14604611e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -639,7 +639,7 @@ static void mce_init_timer(void) if (!next_interval) return; setup_timer(t, mcheck_timer, smp_processor_id()); - t->expires = round_jiffies_relative(jiffies + next_interval); + t->expires = round_jiffies(jiffies + next_interval); add_timer(t); } @@ -1110,7 +1110,7 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - t->expires = round_jiffies_relative(jiffies + next_interval); + t->expires = round_jiffies(jiffies + next_interval); add_timer_on(t, cpu); smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; -- cgit v1.2.3 From bf5172d07ac38e538e01143289e9b46076494ad5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Mar 2009 22:04:45 +0100 Subject: x86: convert obsolete irq_desc_t typedef to struct irq_desc Impact: cleanup Convert the last remaining users. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/visws_quirks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 191a876e9e8..31ffc24eec4 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -578,7 +578,7 @@ static struct irq_chip piix4_virtual_irq_type = { static irqreturn_t piix4_master_intr(int irq, void *dev_id) { int realirq; - irq_desc_t *desc; + struct irq_desc *desc; unsigned long flags; spin_lock_irqsave(&i8259A_lock, flags); -- cgit v1.2.3 From 8229d754383e8cd905c38b56bd7365c7fc10dfc1 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 11 Mar 2009 19:13:49 +0530 Subject: x86: cpu architecture debug code, build fix, cleanup move store_ldt outside the CONFIG_PARAVIRT section and also clean up the code a bit. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpu_debug.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 0bdf4daba20..9abbcbd933c 100755 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -427,7 +428,7 @@ static void print_tss(void *arg) seq_printf(seq, " CS\t: %04x\n", seg); asm("movl %%ds,%0" : "=r" (seg)); seq_printf(seq, " DS\t: %04x\n", seg); - seq_printf(seq, " SS\t: %04lx\n", regs->ss); + seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff); asm("movl %%es,%0" : "=r" (seg)); seq_printf(seq, " ES\t: %04x\n", seg); asm("movl %%fs,%0" : "=r" (seg)); -- cgit v1.2.3 From 9fa7266c2f0cdfcb09eacba2d3624bec7df78641 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 10:34:45 +0000 Subject: x86: remove leftover unwind annotations Impact: cleanup These got left in needlessly when ret_from_fork got simplified. Signed-off-by: Jan Beulich LKML-Reference: <49B8F355.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 7ba4621c0df..54866bb5d38 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -416,7 +416,6 @@ ENTRY(ret_from_fork) GET_THREAD_INFO(%rcx) - CFI_REMEMBER_STATE RESTORE_REST testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? @@ -428,7 +427,6 @@ ENTRY(ret_from_fork) RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET jmp ret_from_sys_call # go to the SYSRET fastpath - CFI_RESTORE_STATE CFI_ENDPROC END(ret_from_fork) -- cgit v1.2.3 From c2810188c1b810c68139608a207befae0a4f1e69 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 10:38:55 +0000 Subject: x86-64: move save_paranoid into .kprobes.text Impact: mark save_paranoid as non-kprobe-able code This appears to be necessary as the function gets called from kprobes-unsafe exception handling stubs (i.e. which themselves live in .kprobes.text). Signed-off-by: Jan Beulich LKML-Reference: <49B8F44F.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 54866bb5d38..a331ec38af9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -368,6 +368,7 @@ ENTRY(save_rest) END(save_rest) /* save complete stack frame */ + .pushsection .kprobes.text, "ax" ENTRY(save_paranoid) XCPT_FRAME 1 RDI+8 cld @@ -396,6 +397,7 @@ ENTRY(save_paranoid) 1: ret CFI_ENDPROC END(save_paranoid) + .popsection /* * A newly forked process directly context switches into this address. -- cgit v1.2.3 From 02dde8b45c5460794b9052d7c12939fe3eb63c2c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 12:08:49 +0000 Subject: x86: move various CPU initialization objects into .cpuinit.rodata Impact: debuggability and micro-optimization Putting whatever is possible into the (final) .rodata section increases the likelihood of catching memory corruption bugs early, and reduces false cache line sharing. Signed-off-by: Jan Beulich LKML-Reference: <49B90961.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/addon_cpuid_features.c | 2 +- arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/centaur.c | 2 +- arch/x86/kernel/cpu/centaur_64.c | 2 +- arch/x86/kernel/cpu/common.c | 20 ++++++++++---------- arch/x86/kernel/cpu/cpu.h | 11 ++++++----- arch/x86/kernel/cpu/cyrix.c | 16 ++++++++-------- arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/kernel/cpu/intel_cacheinfo.c | 8 ++++---- arch/x86/kernel/cpu/transmeta.c | 2 +- arch/x86/kernel/cpu/umc.c | 2 +- arch/x86/kernel/mmconf-fam10h_64.c | 2 +- 12 files changed, 36 insertions(+), 35 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 6882a735d9c..8220ae69849 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -29,7 +29,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) u32 regs[4]; const struct cpuid_bit *cb; - static const struct cpuid_bit cpuid_bits[] = { + static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, { 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f47df59016c..7e4a459daa6 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -502,7 +502,7 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int } #endif -static struct cpu_dev amd_cpu_dev __cpuinitdata = { +static const struct cpu_dev __cpuinitconst amd_cpu_dev = { .c_vendor = "AMD", .c_ident = { "AuthenticAMD" }, #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 89bfdd9cacc..983e0830f0d 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -468,7 +468,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) return size; } -static struct cpu_dev centaur_cpu_dev __cpuinitdata = { +static const struct cpu_dev __cpuinitconst centaur_cpu_dev = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, .c_early_init = early_init_centaur, diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c index a1625f5a1e7..51b09c48c9c 100644 --- a/arch/x86/kernel/cpu/centaur_64.c +++ b/arch/x86/kernel/cpu/centaur_64.c @@ -25,7 +25,7 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); } -static struct cpu_dev centaur_cpu_dev __cpuinitdata = { +static const struct cpu_dev centaur_cpu_dev __cpuinitconst = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, .c_early_init = early_init_centaur, diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f8869978bbb..54cbe7690f9 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -70,7 +70,7 @@ cpumask_t cpu_sibling_setup_map; #endif /* CONFIG_X86_32 */ -static struct cpu_dev *this_cpu __cpuinitdata; +static const struct cpu_dev *this_cpu __cpuinitdata; DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 @@ -274,9 +274,9 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) */ /* Look up CPU names by table lookup. */ -static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) +static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c) { - struct cpu_model_info *info; + const struct cpu_model_info *info; if (c->x86_model >= 16) return NULL; /* Range check */ @@ -321,7 +321,7 @@ void switch_to_new_gdt(int cpu) load_percpu_segment(cpu); } -static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; +static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; static void __cpuinit default_init(struct cpuinfo_x86 *c) { @@ -340,7 +340,7 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c) #endif } -static struct cpu_dev __cpuinitdata default_cpu = { +static const struct cpu_dev __cpuinitconst default_cpu = { .c_init = default_init, .c_vendor = "Unknown", .c_x86_vendor = X86_VENDOR_UNKNOWN, @@ -634,12 +634,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) void __init early_cpu_init(void) { - struct cpu_dev **cdev; + const struct cpu_dev *const *cdev; int count = 0; printk("KERNEL supported cpus:\n"); for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { - struct cpu_dev *cpudev = *cdev; + const struct cpu_dev *cpudev = *cdev; unsigned int j; if (count >= X86_VENDOR_NUM) @@ -768,7 +768,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* If the model name is still unset, do table lookup. */ if (!c->x86_model_id[0]) { - char *p; + const char *p; p = table_lookup_model(c); if (p) strcpy(c->x86_model_id, p); @@ -847,7 +847,7 @@ struct msr_range { unsigned max; }; -static struct msr_range msr_range_array[] __cpuinitdata = { +static const struct msr_range msr_range_array[] __cpuinitconst = { { 0x00000000, 0x00000418}, { 0xc0000000, 0xc000040b}, { 0xc0010000, 0xc0010142}, @@ -894,7 +894,7 @@ __setup("noclflush", setup_noclflush); void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { - char *vendor = NULL; + const char *vendor = NULL; if (c->x86_vendor < X86_VENDOR_NUM) vendor = this_cpu->c_vendor; diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index de4094a3921..9469ecb5aeb 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -5,15 +5,15 @@ struct cpu_model_info { int vendor; int family; - char *model_names[16]; + const char *model_names[16]; }; /* attempt to consolidate cpu attributes */ struct cpu_dev { - char * c_vendor; + const char * c_vendor; /* some have two possibilities for cpuid string */ - char * c_ident[2]; + const char * c_ident[2]; struct cpu_model_info c_models[4]; @@ -25,11 +25,12 @@ struct cpu_dev { }; #define cpu_dev_register(cpu_devX) \ - static struct cpu_dev *__cpu_dev_##cpu_devX __used \ + static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ __attribute__((__section__(".x86_cpu_dev.init"))) = \ &cpu_devX; -extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[]; +extern const struct cpu_dev *const __x86_cpu_dev_start[], + *const __x86_cpu_dev_end[]; extern void display_cacheinfo(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index ffd0f5ed071..593171e967e 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -61,23 +61,23 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) */ static unsigned char Cx86_dir0_msb __cpuinitdata = 0; -static char Cx86_model[][9] __cpuinitdata = { +static const char __cpuinitconst Cx86_model[][9] = { "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ", "M II ", "Unknown" }; -static char Cx486_name[][5] __cpuinitdata = { +static const char __cpuinitconst Cx486_name[][5] = { "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx", "SRx2", "DRx2" }; -static char Cx486S_name[][4] __cpuinitdata = { +static const char __cpuinitconst Cx486S_name[][4] = { "S", "S2", "Se", "S2e" }; -static char Cx486D_name[][4] __cpuinitdata = { +static const char __cpuinitconst Cx486D_name[][4] = { "DX", "DX2", "?", "?", "?", "DX4" }; static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock"; -static char cyrix_model_mult1[] __cpuinitdata = "12??43"; -static char cyrix_model_mult2[] __cpuinitdata = "12233445"; +static const char __cpuinitconst cyrix_model_mult1[] = "12??43"; +static const char __cpuinitconst cyrix_model_mult2[] = "12233445"; /* * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old @@ -435,7 +435,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) } } -static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { +static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = { .c_vendor = "Cyrix", .c_ident = { "CyrixInstead" }, .c_early_init = early_init_cyrix, @@ -446,7 +446,7 @@ static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { cpu_dev_register(cyrix_cpu_dev); -static struct cpu_dev nsc_cpu_dev __cpuinitdata = { +static const struct cpu_dev __cpuinitconst nsc_cpu_dev = { .c_vendor = "NSC", .c_ident = { "Geode by NSC" }, .c_init = init_nsc, diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 191117f1ad5..968f15129ed 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -410,7 +410,7 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i } #endif -static struct cpu_dev intel_cpu_dev __cpuinitdata = { +static const struct cpu_dev __cpuinitconst intel_cpu_dev = { .c_vendor = "Intel", .c_ident = { "GenuineIntel" }, #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 7293508d8f5..c471eb1a389 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -32,7 +32,7 @@ struct _cache_table }; /* all the cache descriptor types we care about (no TLB or trace cache entries) */ -static struct _cache_table cache_table[] __cpuinitdata = +static const struct _cache_table __cpuinitconst cache_table[] = { { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ @@ -206,15 +206,15 @@ union l3_cache { unsigned val; }; -static unsigned short assocs[] __cpuinitdata = { +static const unsigned short __cpuinitconst assocs[] = { [1] = 1, [2] = 2, [4] = 4, [6] = 8, [8] = 16, [0xa] = 32, [0xb] = 48, [0xc] = 64, [0xf] = 0xffff // ?? }; -static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 }; -static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 }; +static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; +static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 }; static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 52b3fefbd5a..bb62b3e5caa 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -98,7 +98,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) #endif } -static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { +static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = { .c_vendor = "Transmeta", .c_ident = { "GenuineTMx86", "TransmetaCPU" }, .c_early_init = early_init_transmeta, diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c index e777f79e096..fd2c37bf7ac 100644 --- a/arch/x86/kernel/cpu/umc.c +++ b/arch/x86/kernel/cpu/umc.c @@ -8,7 +8,7 @@ * so no special init takes place. */ -static struct cpu_dev umc_cpu_dev __cpuinitdata = { +static const struct cpu_dev __cpuinitconst umc_cpu_dev = { .c_vendor = "UMC", .c_ident = { "UMC UMC UMC" }, .c_models = { diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index 666e43df51f..712d15fdc41 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -226,7 +226,7 @@ static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d) return 0; } -static struct dmi_system_id __devinitdata mmconf_dmi_table[] = { +static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = { { .callback = set_check_enable_amd_mmconf, .ident = "Sun Microsystems Machine", -- cgit v1.2.3 From 7a81d9a7da03d2f27840d659f97ef140d032f609 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 12:45:15 +0000 Subject: x86: smarten /proc/interrupts output Impact: change /proc/interrupts output ABI With the number of interrupts on large systems growing, assumptions on the width an interrupt number requires when converted to a decimal string turn invalid. Therefore, calculate the maximum number of digits dynamically. Signed-off-by: Jan Beulich LKML-Reference: <49B911EB.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index b864341dcc4..b8ac3b6cf77 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -45,16 +45,16 @@ void ack_bad_irq(unsigned int irq) /* * /proc/interrupts printing: */ -static int show_other_interrupts(struct seq_file *p) +static int show_other_interrupts(struct seq_file *p, int prec) { int j; - seq_printf(p, "NMI: "); + seq_printf(p, "%*s: ", prec, "NMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); seq_printf(p, " Non-maskable interrupts\n"); #ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "LOC: "); + seq_printf(p, "%*s: ", prec, "LOC"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); seq_printf(p, " Local timer interrupts\n"); @@ -66,40 +66,40 @@ static int show_other_interrupts(struct seq_file *p) seq_printf(p, " Platform interrupts\n"); } #ifdef CONFIG_SMP - seq_printf(p, "RES: "); + seq_printf(p, "%*s: ", prec, "RES"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); seq_printf(p, " Rescheduling interrupts\n"); - seq_printf(p, "CAL: "); + seq_printf(p, "%*s: ", prec, "CAL"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); seq_printf(p, " Function call interrupts\n"); - seq_printf(p, "TLB: "); + seq_printf(p, "%*s: ", prec, "TLB"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); seq_printf(p, " TLB shootdowns\n"); #endif #ifdef CONFIG_X86_MCE - seq_printf(p, "TRM: "); + seq_printf(p, "%*s: ", prec, "TRM"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); seq_printf(p, " Thermal event interrupts\n"); # ifdef CONFIG_X86_64 - seq_printf(p, "THR: "); + seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); # endif #endif #ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "SPU: "); + seq_printf(p, "%*s: ", prec, "SPU"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); #endif - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); + seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); + seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); #endif return 0; } @@ -107,19 +107,22 @@ static int show_other_interrupts(struct seq_file *p) int show_interrupts(struct seq_file *p, void *v) { unsigned long flags, any_count = 0; - int i = *(loff_t *) v, j; + int i = *(loff_t *) v, j, prec; struct irqaction *action; struct irq_desc *desc; if (i > nr_irqs) return 0; + for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) + j *= 10; + if (i == nr_irqs) - return show_other_interrupts(p); + return show_other_interrupts(p, prec); /* print header */ if (i == 0) { - seq_printf(p, " "); + seq_printf(p, "%*s", prec + 8, ""); for_each_online_cpu(j) seq_printf(p, "CPU%-8d", j); seq_putc(p, '\n'); @@ -140,7 +143,7 @@ int show_interrupts(struct seq_file *p, void *v) if (!action && !any_count) goto out; - seq_printf(p, "%3d: ", i); + seq_printf(p, "%*d: ", prec, i); #ifndef CONFIG_SMP seq_printf(p, "%10u ", kstat_irqs(i)); #else -- cgit v1.2.3 From 13c6c53282d99c82e79b02477efd2c1e30a991ef Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 12:37:34 +0000 Subject: x86, 32-bit: also use cpuinfo_x86's x86_{phys,virt}_bits members Impact: 32/64-bit consolidation In a first step, this allows fixing phys_addr_valid() for PAE (which until now reported all addresses to be valid). Subsequently, this will also allow simplifying some MTRR handling code. Signed-off-by: Jan Beulich LKML-Reference: <49B9101E.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 12 +++++++++++- arch/x86/kernel/cpu/intel.c | 5 +++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 826d5c87627..a95e9480bb9 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -549,13 +549,15 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) } } -#ifdef CONFIG_X86_64 if (c->extended_cpuid_level >= 0x80000008) { u32 eax = cpuid_eax(0x80000008); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; } +#ifdef CONFIG_X86_32 + else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) + c->x86_phys_bits = 36; #endif if (c->extended_cpuid_level >= 0x80000007) @@ -602,8 +604,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; #else c->x86_clflush_size = 32; + c->x86_phys_bits = 32; + c->x86_virt_bits = 32; #endif c->x86_cache_alignment = c->x86_clflush_size; @@ -726,9 +732,13 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86_coreid_bits = 0; #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; #else c->cpuid_level = -1; /* CPUID not detected */ c->x86_clflush_size = 32; + c->x86_phys_bits = 32; + c->x86_virt_bits = 32; #endif c->x86_cache_alignment = c->x86_clflush_size; memset(&c->x86_capability, 0, sizeof c->x86_capability); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 191117f1ad5..ae769471042 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -54,6 +54,11 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) c->x86_cache_alignment = 128; #endif + /* CPUID workaround for 0F33/0F34 CPU */ + if (c->x86 == 0xF && c->x86_model == 0x3 + && (c->x86_mask == 0x3 || c->x86_mask == 0x4)) + c->x86_phys_bits = 36; + /* * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate * with P/T states and does not stop in deep C-states -- cgit v1.2.3 From 9a50156a1c7bfa65315facaffdfbed6513fcfd3b Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 12:41:23 +0000 Subject: x86: properly __init-annotate recent early_printk additions Impact: cleanup, save memory Don't keep code resident that's only needed during startup. Signed-off-by: Jan Beulich LKML-Reference: <49B91103.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/early_printk.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 639ad98238a..335f049d110 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -250,7 +250,7 @@ static int dbgp_wait_until_complete(void) return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); } -static void dbgp_mdelay(int ms) +static void __init dbgp_mdelay(int ms) { int i; @@ -311,7 +311,7 @@ static void dbgp_set_data(const void *buf, int size) writel(hi, &ehci_debug->data47); } -static void dbgp_get_data(void *buf, int size) +static void __init dbgp_get_data(void *buf, int size) { unsigned char *bytes = buf; u32 lo, hi; @@ -355,7 +355,7 @@ static int dbgp_bulk_write(unsigned devnum, unsigned endpoint, return ret; } -static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, +static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, int size) { u32 pids, addr, ctrl; @@ -386,8 +386,8 @@ static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, return ret; } -static int dbgp_control_msg(unsigned devnum, int requesttype, int request, - int value, int index, void *data, int size) +static int __init dbgp_control_msg(unsigned devnum, int requesttype, + int request, int value, int index, void *data, int size) { u32 pids, addr, ctrl; struct usb_ctrlrequest req; @@ -489,7 +489,7 @@ static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc) return 0; } -static int ehci_reset_port(int port) +static int __init ehci_reset_port(int port) { u32 portsc; u32 delay_time, delay; @@ -532,7 +532,7 @@ static int ehci_reset_port(int port) return -EBUSY; } -static int ehci_wait_for_port(int port) +static int __init ehci_wait_for_port(int port) { u32 status; int ret, reps; @@ -557,13 +557,13 @@ static inline void dbgp_printk(const char *fmt, ...) { } typedef void (*set_debug_port_t)(int port); -static void default_set_debug_port(int port) +static void __init default_set_debug_port(int port) { } -static set_debug_port_t set_debug_port = default_set_debug_port; +static set_debug_port_t __initdata set_debug_port = default_set_debug_port; -static void nvidia_set_debug_port(int port) +static void __init nvidia_set_debug_port(int port) { u32 dword; dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, -- cgit v1.2.3 From 82034d6f59b4772f4233bbb61c670290803a9960 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 12:57:10 +0000 Subject: x86: clean up output resulting from update_mptable option Impact: cleanup Without apic=verbose, using the update_mptable option would result in garbled and confusing output due to the inconsistent use of printk() vs apic_printk(). Signed-off-by: Jan Beulich LKML-Reference: <49B914B6.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index e8192401da4..47673e02ae5 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -890,12 +890,12 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, #ifdef CONFIG_X86_IO_APIC struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; - printk(KERN_INFO "OLD "); + apic_printk(APIC_VERBOSE, "OLD "); print_MP_intsrc_info(m); i = get_MP_intsrc_index(m); if (i > 0) { assign_to_mpc_intsrc(&mp_irqs[i], m); - printk(KERN_INFO "NEW "); + apic_printk(APIC_VERBOSE, "NEW "); print_mp_irq_info(&mp_irqs[i]); } else if (!i) { /* legacy, do nothing */ @@ -943,7 +943,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, continue; if (nr_m_spare > 0) { - printk(KERN_INFO "*NEW* found "); + apic_printk(APIC_VERBOSE, "*NEW* found\n"); nr_m_spare--; assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); m_spare[nr_m_spare] = NULL; -- cgit v1.2.3 From 5c0e6f035df983210e4d22213aed624ced502d3d Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 13:07:23 +0000 Subject: x86: fix code paths used by update_mptable Impact: fix crashes under Xen due to unrobust e820 code find_e820_area_size() must return a properly distinguishable and out-of-bounds value when it fails, and -1UL does not meet that criteria on i386/PAE. Additionally, callers of the function must check against that value. early_reserve_e820() should be prepared for the region found to be outside of the addressable range on 32-bits. e820_update_range_map() should not blindly update e820, but should do all it work on the map it got a pointer passed for (which in 50% of the cases is &e820_saved). It must also not call e820_add_region(), as that again acts on e820 unconditionally. The issues were found when trying to make this option work in our Xen kernel (i.e. where some of the silent assumptions made in the code would not hold). Signed-off-by: Jan Beulich LKML-Reference: <49B9171B.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/check.c | 2 +- arch/x86/kernel/e820.c | 32 +++++++++++++++++++++++++------- 2 files changed, 26 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 2ac0ab71412..b617b1164f1 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -83,7 +83,7 @@ void __init setup_bios_corruption_check(void) u64 size; addr = find_e820_area_size(addr, &size, PAGE_SIZE); - if (addr == 0) + if (!(addr + 1)) break; if ((addr + size) > corruption_check_size) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 508bec1cee2..3cf6681ac80 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -421,7 +421,7 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, u64 size, unsigned old_type, unsigned new_type) { - int i; + unsigned int i, x; u64 real_updated_size = 0; BUG_ON(old_type == new_type); @@ -429,7 +429,7 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, if (size > (ULLONG_MAX - start)) size = ULLONG_MAX - start; - for (i = 0; i < e820.nr_map; i++) { + for (i = 0; i < e820x->nr_map; i++) { struct e820entry *ei = &e820x->map[i]; u64 final_start, final_end; if (ei->type != old_type) @@ -446,14 +446,23 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, final_end = min(start + size, ei->addr + ei->size); if (final_start >= final_end) continue; - e820_add_region(final_start, final_end - final_start, - new_type); + + x = e820x->nr_map; + if (x == ARRAY_SIZE(e820x->map)) { + printk(KERN_ERR "Too many memory map entries!\n"); + break; + } + e820x->map[x].addr = final_start; + e820x->map[x].size = final_end - final_start; + e820x->map[x].type = new_type; + e820x->nr_map++; + real_updated_size += final_end - final_start; - ei->size -= final_end - final_start; if (ei->addr < final_start) continue; ei->addr = final_end; + ei->size -= final_end - final_start; } return real_updated_size; } @@ -1020,8 +1029,8 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) continue; return addr; } - return -1UL; + return -1ULL; } /* @@ -1034,13 +1043,22 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) u64 start; start = startt; - while (size < sizet) + while (size < sizet && (start + 1)) start = find_e820_area_size(start, &size, align); if (size < sizet) return 0; +#ifdef CONFIG_X86_32 + if (start >= MAXMEM) + return 0; + if (start + size > MAXMEM) + size = MAXMEM - start; +#endif + addr = round_down(start + size - sizet, align); + if (addr < start) + return 0; e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); printk(KERN_INFO "update e820 for early_reserve_e820\n"); -- cgit v1.2.3 From 8ad9790588ee2e69118b2b294ddab6f3f0379ad9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 12 Mar 2009 18:43:54 -0700 Subject: x86: more MTRR debug printouts Impact: improve MTRR debugging messages There's still inefficiencies suspected with the MTRR sanitizing code, so make sure we get all the info we need from a dmesg. - Remove unneeded mtrr_show (It will only printout one time by first cpu, so it is no big deal.) - Also print out directly from get_mtrr, because it doesn't update mtrr_state. Signed-off-by: Yinghai Lu LKML-Reference: <49B9BA5A.40108@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 95 ++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 44 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0c0a455fe95..96440352010 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -33,14 +33,6 @@ u64 mtrr_tom2; struct mtrr_state_type mtrr_state = {}; EXPORT_SYMBOL_GPL(mtrr_state); -static int __initdata mtrr_show; -static int __init mtrr_debug(char *opt) -{ - mtrr_show = 1; - return 0; -} -early_param("mtrr.show", mtrr_debug); - /* * Returns the effective MTRR type for the region * Error returns: @@ -193,13 +185,51 @@ static void print_fixed(unsigned base, unsigned step, const mtrr_type*types) unsigned i; for (i = 0; i < 8; ++i, ++types, base += step) - printk(KERN_INFO "MTRR %05X-%05X %s\n", + printk(KERN_INFO " %05X-%05X %s\n", base, base + step - 1, mtrr_attrib_to_str(*types)); } static void prepare_set(void); static void post_set(void); +static void __init print_mtrr_state(void) +{ + unsigned int i; + int high_width; + + printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type)); + if (mtrr_state.have_fixed) { + printk(KERN_INFO "MTRR fixed ranges %sabled:\n", + mtrr_state.enabled & 1 ? "en" : "dis"); + print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); + for (i = 0; i < 2; ++i) + print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); + for (i = 0; i < 8; ++i) + print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); + } + printk(KERN_INFO "MTRR variable ranges %sabled:\n", + mtrr_state.enabled & 2 ? "en" : "dis"); + high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4; + for (i = 0; i < num_var_ranges; ++i) { + if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) + printk(KERN_INFO " %u base %0*X%05X000 mask %0*X%05X000 %s\n", + i, + high_width, + mtrr_state.var_ranges[i].base_hi, + mtrr_state.var_ranges[i].base_lo >> 12, + high_width, + mtrr_state.var_ranges[i].mask_hi, + mtrr_state.var_ranges[i].mask_lo >> 12, + mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); + else + printk(KERN_INFO " %u disabled\n", i); + } + if (mtrr_tom2) { + printk(KERN_INFO "TOM2: %016llx aka %lldM\n", + mtrr_tom2, mtrr_tom2>>20); + } +} + /* Grab all of the MTRR state for this CPU into *state */ void __init get_mtrr_state(void) { @@ -231,41 +261,9 @@ void __init get_mtrr_state(void) mtrr_tom2 |= low; mtrr_tom2 &= 0xffffff800000ULL; } - if (mtrr_show) { - int high_width; - - printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type)); - if (mtrr_state.have_fixed) { - printk(KERN_INFO "MTRR fixed ranges %sabled:\n", - mtrr_state.enabled & 1 ? "en" : "dis"); - print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); - for (i = 0; i < 2; ++i) - print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); - for (i = 0; i < 8; ++i) - print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); - } - printk(KERN_INFO "MTRR variable ranges %sabled:\n", - mtrr_state.enabled & 2 ? "en" : "dis"); - high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4; - for (i = 0; i < num_var_ranges; ++i) { - if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) - printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n", - i, - high_width, - mtrr_state.var_ranges[i].base_hi, - mtrr_state.var_ranges[i].base_lo >> 12, - high_width, - mtrr_state.var_ranges[i].mask_hi, - mtrr_state.var_ranges[i].mask_lo >> 12, - mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); - else - printk(KERN_INFO "MTRR %u disabled\n", i); - } - if (mtrr_tom2) { - printk(KERN_INFO "TOM2: %016llx aka %lldM\n", - mtrr_tom2, mtrr_tom2>>20); - } - } + + print_mtrr_state(); + mtrr_state_set = 1; /* PAT setup for BP. We need to go through sync steps here */ @@ -377,7 +375,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, unsigned int mask_lo, mask_hi, base_lo, base_hi; unsigned int tmp, hi; + /* + * get_mtrr doesn't need to update mtrr_state, also it could be called + * from any cpu, so try to print it out directly. + */ rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); + if ((mask_lo & 0x800) == 0) { /* Invalid (i.e. free) range */ *base = 0; @@ -407,6 +410,10 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, *size = -mask_lo; *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; *type = base_lo & 0xff; + + printk(KERN_DEBUG " get_mtrr: cpu%d reg%02d base=%010lx size=%010lx %s\n", + smp_processor_id(), reg, *base, *size, + mtrr_attrib_to_str(*type & 0xff)); } /** -- cgit v1.2.3 From c1ab7e93c6ddf8a068719b97b7e26c3d8eba7c32 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 Mar 2009 20:05:46 -0700 Subject: x86: print out mtrr_range_state when user specify size Impact: print more debug info Keep it consistent with autodetect version. Signed-off-by: Yinghai Lu LKML-Reference: <49B87C0A.4010105@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 236a401b825..311a4734609 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -1424,6 +1424,8 @@ static int __init mtrr_cleanup(unsigned address_bits) if (!result[i].bad) { set_var_mtrr_all(address_bits); + printk(KERN_DEBUG "New variable MTRRs\n"); + print_out_mtrr_range_state(); return 1; } printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " -- cgit v1.2.3 From 0d890355bff25e1dc03a577a90ed80741489ca54 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 Mar 2009 20:07:39 -0700 Subject: x86: separate mtrr cleanup/mtrr_e820 trim to separate file Impact: cleanup mtrr main.c is too big, seperate mtrr cleanup and mtrr e820 trim code to another file. Signed-off-by: Yinghai Lu LKML-Reference: <49B87C7B.80809@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/Makefile | 2 +- arch/x86/kernel/cpu/mtrr/cleanup.c | 1089 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/mtrr/main.c | 1055 +--------------------------------- arch/x86/kernel/cpu/mtrr/mtrr.h | 3 + 4 files changed, 1094 insertions(+), 1055 deletions(-) create mode 100644 arch/x86/kernel/cpu/mtrr/cleanup.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile index 191fc053364..f4361b56f8e 100644 --- a/arch/x86/kernel/cpu/mtrr/Makefile +++ b/arch/x86/kernel/cpu/mtrr/Makefile @@ -1,3 +1,3 @@ -obj-y := main.o if.o generic.o state.o +obj-y := main.o if.o generic.o state.o cleanup.o obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c new file mode 100644 index 00000000000..58b58bbf7eb --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -0,0 +1,1089 @@ +/* MTRR (Memory Type Range Register) cleanup + + Copyright (C) 2009 Yinghai Lu + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with this library; if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "mtrr.h" + +/* should be related to MTRR_VAR_RANGES nums */ +#define RANGE_NUM 256 + +struct res_range { + unsigned long start; + unsigned long end; +}; + +static int __init +add_range(struct res_range *range, int nr_range, unsigned long start, + unsigned long end) +{ + /* out of slots */ + if (nr_range >= RANGE_NUM) + return nr_range; + + range[nr_range].start = start; + range[nr_range].end = end; + + nr_range++; + + return nr_range; +} + +static int __init +add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, + unsigned long end) +{ + int i; + + /* try to merge it with old one */ + for (i = 0; i < nr_range; i++) { + unsigned long final_start, final_end; + unsigned long common_start, common_end; + + if (!range[i].end) + continue; + + common_start = max(range[i].start, start); + common_end = min(range[i].end, end); + if (common_start > common_end + 1) + continue; + + final_start = min(range[i].start, start); + final_end = max(range[i].end, end); + + range[i].start = final_start; + range[i].end = final_end; + return nr_range; + } + + /* need to add that */ + return add_range(range, nr_range, start, end); +} + +static void __init +subtract_range(struct res_range *range, unsigned long start, unsigned long end) +{ + int i, j; + + for (j = 0; j < RANGE_NUM; j++) { + if (!range[j].end) + continue; + + if (start <= range[j].start && end >= range[j].end) { + range[j].start = 0; + range[j].end = 0; + continue; + } + + if (start <= range[j].start && end < range[j].end && + range[j].start < end + 1) { + range[j].start = end + 1; + continue; + } + + + if (start > range[j].start && end >= range[j].end && + range[j].end > start - 1) { + range[j].end = start - 1; + continue; + } + + if (start > range[j].start && end < range[j].end) { + /* find the new spare */ + for (i = 0; i < RANGE_NUM; i++) { + if (range[i].end == 0) + break; + } + if (i < RANGE_NUM) { + range[i].end = range[j].end; + range[i].start = end + 1; + } else { + printk(KERN_ERR "run of slot in ranges\n"); + } + range[j].end = start - 1; + continue; + } + } +} + +static int __init cmp_range(const void *x1, const void *x2) +{ + const struct res_range *r1 = x1; + const struct res_range *r2 = x2; + long start1, start2; + + start1 = r1->start; + start2 = r2->start; + + return start1 - start2; +} + +struct var_mtrr_range_state { + unsigned long base_pfn; + unsigned long size_pfn; + mtrr_type type; +}; + +static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; +static int __initdata debug_print; + +static int __init +x86_get_mtrr_mem_range(struct res_range *range, int nr_range, + unsigned long extra_remove_base, + unsigned long extra_remove_size) +{ + unsigned long i, base, size; + mtrr_type type; + + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type != MTRR_TYPE_WRBACK) + continue; + base = range_state[i].base_pfn; + size = range_state[i].size_pfn; + nr_range = add_range_with_merge(range, nr_range, base, + base + size - 1); + } + if (debug_print) { + printk(KERN_DEBUG "After WB checking\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + range[i].start, range[i].end + 1); + } + + /* take out UC ranges */ + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type != MTRR_TYPE_UNCACHABLE && + type != MTRR_TYPE_WRPROT) + continue; + size = range_state[i].size_pfn; + if (!size) + continue; + base = range_state[i].base_pfn; + subtract_range(range, base, base + size - 1); + } + if (extra_remove_size) + subtract_range(range, extra_remove_base, + extra_remove_base + extra_remove_size - 1); + + /* get new range num */ + nr_range = 0; + for (i = 0; i < RANGE_NUM; i++) { + if (!range[i].end) + continue; + nr_range++; + } + if (debug_print) { + printk(KERN_DEBUG "After UC checking\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + range[i].start, range[i].end + 1); + } + + /* sort the ranges */ + sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + if (debug_print) { + printk(KERN_DEBUG "After sorting\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + range[i].start, range[i].end + 1); + } + + /* clear those is not used */ + for (i = nr_range; i < RANGE_NUM; i++) + memset(&range[i], 0, sizeof(range[i])); + + return nr_range; +} + +static struct res_range __initdata range[RANGE_NUM]; +static int __initdata nr_range; + +#ifdef CONFIG_MTRR_SANITIZER + +static unsigned long __init sum_ranges(struct res_range *range, int nr_range) +{ + unsigned long sum; + int i; + + sum = 0; + for (i = 0; i < nr_range; i++) + sum += range[i].end + 1 - range[i].start; + + return sum; +} + +static int enable_mtrr_cleanup __initdata = + CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT; + +static int __init disable_mtrr_cleanup_setup(char *str) +{ + enable_mtrr_cleanup = 0; + return 0; +} +early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); + +static int __init enable_mtrr_cleanup_setup(char *str) +{ + enable_mtrr_cleanup = 1; + return 0; +} +early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); + +static int __init mtrr_cleanup_debug_setup(char *str) +{ + debug_print = 1; + return 0; +} +early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); + +struct var_mtrr_state { + unsigned long range_startk; + unsigned long range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + unsigned int reg; +}; + +static void __init +set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, + unsigned char type, unsigned int address_bits) +{ + u32 base_lo, base_hi, mask_lo, mask_hi; + u64 base, mask; + + if (!sizek) { + fill_mtrr_var_range(reg, 0, 0, 0, 0); + return; + } + + mask = (1ULL << address_bits) - 1; + mask &= ~((((u64)sizek) << 10) - 1); + + base = ((u64)basek) << 10; + + base |= type; + mask |= 0x800; + + base_lo = base & ((1ULL<<32) - 1); + base_hi = base >> 32; + + mask_lo = mask & ((1ULL<<32) - 1); + mask_hi = mask >> 32; + + fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi); +} + +static void __init +save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, + unsigned char type) +{ + range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); + range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); + range_state[reg].type = type; +} + +static void __init +set_var_mtrr_all(unsigned int address_bits) +{ + unsigned long basek, sizek; + unsigned char type; + unsigned int reg; + + for (reg = 0; reg < num_var_ranges; reg++) { + basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10); + sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10); + type = range_state[reg].type; + + set_var_mtrr(reg, basek, sizek, type, address_bits); + } +} + +static unsigned long to_size_factor(unsigned long sizek, char *factorp) +{ + char factor; + unsigned long base = sizek; + + if (base & ((1<<10) - 1)) { + /* not MB alignment */ + factor = 'K'; + } else if (base & ((1<<20) - 1)) { + factor = 'M'; + base >>= 10; + } else { + factor = 'G'; + base >>= 20; + } + + *factorp = factor; + + return base; +} + +static unsigned int __init +range_to_mtrr(unsigned int reg, unsigned long range_startk, + unsigned long range_sizek, unsigned char type) +{ + if (!range_sizek || (reg >= num_var_ranges)) + return reg; + + while (range_sizek) { + unsigned long max_align, align; + unsigned long sizek; + + /* Compute the maximum size I can make a range */ + if (range_startk) + max_align = ffs(range_startk) - 1; + else + max_align = 32; + align = fls(range_sizek) - 1; + if (align > max_align) + align = max_align; + + sizek = 1 << align; + if (debug_print) { + char start_factor = 'K', size_factor = 'K'; + unsigned long start_base, size_base; + + start_base = to_size_factor(range_startk, + &start_factor), + size_base = to_size_factor(sizek, &size_factor), + + printk(KERN_DEBUG "Setting variable MTRR %d, " + "base: %ld%cB, range: %ld%cB, type %s\n", + reg, start_base, start_factor, + size_base, size_factor, + (type == MTRR_TYPE_UNCACHABLE) ? "UC" : + ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other") + ); + } + save_var_mtrr(reg++, range_startk, sizek, type); + range_startk += sizek; + range_sizek -= sizek; + if (reg >= num_var_ranges) + break; + } + return reg; +} + +static unsigned __init +range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, + unsigned long sizek) +{ + unsigned long hole_basek, hole_sizek; + unsigned long second_basek, second_sizek; + unsigned long range0_basek, range0_sizek; + unsigned long range_basek, range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + + hole_basek = 0; + hole_sizek = 0; + second_basek = 0; + second_sizek = 0; + chunk_sizek = state->chunk_sizek; + gran_sizek = state->gran_sizek; + + /* align with gran size, prevent small block used up MTRRs */ + range_basek = ALIGN(state->range_startk, gran_sizek); + if ((range_basek > basek) && basek) + return second_sizek; + state->range_sizek -= (range_basek - state->range_startk); + range_sizek = ALIGN(state->range_sizek, gran_sizek); + + while (range_sizek > state->range_sizek) { + range_sizek -= gran_sizek; + if (!range_sizek) + return 0; + } + state->range_sizek = range_sizek; + + /* try to append some small hole */ + range0_basek = state->range_startk; + range0_sizek = ALIGN(state->range_sizek, chunk_sizek); + + /* no increase */ + if (range0_sizek == state->range_sizek) { + if (debug_print) + printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + state->range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range0_basek, + state->range_sizek, MTRR_TYPE_WRBACK); + return 0; + } + + /* only cut back, when it is not the last */ + if (sizek) { + while (range0_basek + range0_sizek > (basek + sizek)) { + if (range0_sizek >= chunk_sizek) + range0_sizek -= chunk_sizek; + else + range0_sizek = 0; + + if (!range0_sizek) + break; + } + } + +second_try: + range_basek = range0_basek + range0_sizek; + + /* one hole in the middle */ + if (range_basek > basek && range_basek <= (basek + sizek)) + second_sizek = range_basek - basek; + + if (range0_sizek > state->range_sizek) { + + /* one hole in middle or at end */ + hole_sizek = range0_sizek - state->range_sizek - second_sizek; + + /* hole size should be less than half of range0 size */ + if (hole_sizek >= (range0_sizek >> 1) && + range0_sizek >= chunk_sizek) { + range0_sizek -= chunk_sizek; + second_sizek = 0; + hole_sizek = 0; + + goto second_try; + } + } + + if (range0_sizek) { + if (debug_print) + printk(KERN_DEBUG "range0: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + range0_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range0_basek, + range0_sizek, MTRR_TYPE_WRBACK); + } + + if (range0_sizek < state->range_sizek) { + /* need to handle left over */ + range_sizek = state->range_sizek - range0_sizek; + + if (debug_print) + printk(KERN_DEBUG "range: %016lx - %016lx\n", + range_basek<<10, + (range_basek + range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range_basek, + range_sizek, MTRR_TYPE_WRBACK); + } + + if (hole_sizek) { + hole_basek = range_basek - hole_sizek - second_sizek; + if (debug_print) + printk(KERN_DEBUG "hole: %016lx - %016lx\n", + hole_basek<<10, + (hole_basek + hole_sizek)<<10); + state->reg = range_to_mtrr(state->reg, hole_basek, + hole_sizek, MTRR_TYPE_UNCACHABLE); + } + + return second_sizek; +} + +static void __init +set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, + unsigned long size_pfn) +{ + unsigned long basek, sizek; + unsigned long second_sizek = 0; + + if (state->reg >= num_var_ranges) + return; + + basek = base_pfn << (PAGE_SHIFT - 10); + sizek = size_pfn << (PAGE_SHIFT - 10); + + /* See if I can merge with the last range */ + if ((basek <= 1024) || + (state->range_startk + state->range_sizek == basek)) { + unsigned long endk = basek + sizek; + state->range_sizek = endk - state->range_startk; + return; + } + /* Write the range mtrrs */ + if (state->range_sizek != 0) + second_sizek = range_to_mtrr_with_hole(state, basek, sizek); + + /* Allocate an msr */ + state->range_startk = basek + second_sizek; + state->range_sizek = sizek - second_sizek; +} + +/* mininum size of mtrr block that can take hole */ +static u64 mtrr_chunk_size __initdata = (256ULL<<20); + +static int __init parse_mtrr_chunk_size_opt(char *p) +{ + if (!p) + return -EINVAL; + mtrr_chunk_size = memparse(p, &p); + return 0; +} +early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); + +/* granity of mtrr of block */ +static u64 mtrr_gran_size __initdata; + +static int __init parse_mtrr_gran_size_opt(char *p) +{ + if (!p) + return -EINVAL; + mtrr_gran_size = memparse(p, &p); + return 0; +} +early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); + +static int nr_mtrr_spare_reg __initdata = + CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; + +static int __init parse_mtrr_spare_reg(char *arg) +{ + if (arg) + nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); + return 0; +} + +early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); + +static int __init +x86_setup_var_mtrrs(struct res_range *range, int nr_range, + u64 chunk_size, u64 gran_size) +{ + struct var_mtrr_state var_state; + int i; + int num_reg; + + var_state.range_startk = 0; + var_state.range_sizek = 0; + var_state.reg = 0; + var_state.chunk_sizek = chunk_size >> 10; + var_state.gran_sizek = gran_size >> 10; + + memset(range_state, 0, sizeof(range_state)); + + /* Write the range etc */ + for (i = 0; i < nr_range; i++) + set_var_mtrr_range(&var_state, range[i].start, + range[i].end - range[i].start + 1); + + /* Write the last range */ + if (var_state.range_sizek != 0) + range_to_mtrr_with_hole(&var_state, 0, 0); + + num_reg = var_state.reg; + /* Clear out the extra MTRR's */ + while (var_state.reg < num_var_ranges) { + save_var_mtrr(var_state.reg, 0, 0, 0); + var_state.reg++; + } + + return num_reg; +} + +struct mtrr_cleanup_result { + unsigned long gran_sizek; + unsigned long chunk_sizek; + unsigned long lose_cover_sizek; + unsigned int num_reg; + int bad; +}; + +/* + * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G + * chunk size: gran_size, ..., 2G + * so we need (1+16)*8 + */ +#define NUM_RESULT 136 +#define PSHIFT (PAGE_SHIFT - 10) + +static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; +static unsigned long __initdata min_loss_pfn[RANGE_NUM]; + +static void __init print_out_mtrr_range_state(void) +{ + int i; + char start_factor = 'K', size_factor = 'K'; + unsigned long start_base, size_base; + mtrr_type type; + + for (i = 0; i < num_var_ranges; i++) { + + size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); + if (!size_base) + continue; + + size_base = to_size_factor(size_base, &size_factor), + start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); + start_base = to_size_factor(start_base, &start_factor), + type = range_state[i].type; + + printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", + i, start_base, start_factor, + size_base, size_factor, + (type == MTRR_TYPE_UNCACHABLE) ? "UC" : + ((type == MTRR_TYPE_WRPROT) ? "WP" : + ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")) + ); + } +} + +static int __init mtrr_need_cleanup(void) +{ + int i; + mtrr_type type; + unsigned long size; + /* extra one for all 0 */ + int num[MTRR_NUM_TYPES + 1]; + + /* check entries number */ + memset(num, 0, sizeof(num)); + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + size = range_state[i].size_pfn; + if (type >= MTRR_NUM_TYPES) + continue; + if (!size) + type = MTRR_NUM_TYPES; + if (type == MTRR_TYPE_WRPROT) + type = MTRR_TYPE_UNCACHABLE; + num[type]++; + } + + /* check if we got UC entries */ + if (!num[MTRR_TYPE_UNCACHABLE]) + return 0; + + /* check if we only had WB and UC */ + if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != + num_var_ranges - num[MTRR_NUM_TYPES]) + return 0; + + return 1; +} + +static unsigned long __initdata range_sums; +static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size, + unsigned long extra_remove_base, + unsigned long extra_remove_size, + int i) +{ + int num_reg; + static struct res_range range_new[RANGE_NUM]; + static int nr_range_new; + unsigned long range_sums_new; + + /* convert ranges to var ranges state */ + num_reg = x86_setup_var_mtrrs(range, nr_range, + chunk_size, gran_size); + + /* we got new setting in range_state, check it */ + memset(range_new, 0, sizeof(range_new)); + nr_range_new = x86_get_mtrr_mem_range(range_new, 0, + extra_remove_base, extra_remove_size); + range_sums_new = sum_ranges(range_new, nr_range_new); + + result[i].chunk_sizek = chunk_size >> 10; + result[i].gran_sizek = gran_size >> 10; + result[i].num_reg = num_reg; + if (range_sums < range_sums_new) { + result[i].lose_cover_sizek = + (range_sums_new - range_sums) << PSHIFT; + result[i].bad = 1; + } else + result[i].lose_cover_sizek = + (range_sums - range_sums_new) << PSHIFT; + + /* double check it */ + if (!result[i].bad && !result[i].lose_cover_sizek) { + if (nr_range_new != nr_range || + memcmp(range, range_new, sizeof(range))) + result[i].bad = 1; + } + + if (!result[i].bad && (range_sums - range_sums_new < + min_loss_pfn[num_reg])) { + min_loss_pfn[num_reg] = + range_sums - range_sums_new; + } +} + +static void __init mtrr_print_out_one_result(int i) +{ + char gran_factor, chunk_factor, lose_factor; + unsigned long gran_base, chunk_base, lose_base; + + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad ? "*BAD*" : " ", + gran_base, gran_factor, chunk_base, chunk_factor); + printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", + result[i].num_reg, result[i].bad ? "-" : "", + lose_base, lose_factor); +} + +static int __init mtrr_search_optimal_index(void) +{ + int i; + int num_reg_good; + int index_good; + + if (nr_mtrr_spare_reg >= num_var_ranges) + nr_mtrr_spare_reg = num_var_ranges - 1; + num_reg_good = -1; + for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { + if (!min_loss_pfn[i]) + num_reg_good = i; + } + + index_good = -1; + if (num_reg_good != -1) { + for (i = 0; i < NUM_RESULT; i++) { + if (!result[i].bad && + result[i].num_reg == num_reg_good && + !result[i].lose_cover_sizek) { + index_good = i; + break; + } + } + } + + return index_good; +} + + +int __init mtrr_cleanup(unsigned address_bits) +{ + unsigned long extra_remove_base, extra_remove_size; + unsigned long base, size, def, dummy; + mtrr_type type; + u64 chunk_size, gran_size; + int index_good; + int i; + + if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) + return 0; + rdmsr(MTRRdefType_MSR, def, dummy); + def &= 0xff; + if (def != MTRR_TYPE_UNCACHABLE) + return 0; + + /* get it and store it aside */ + memset(range_state, 0, sizeof(range_state)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + range_state[i].base_pfn = base; + range_state[i].size_pfn = size; + range_state[i].type = type; + } + + /* check if we need handle it and can handle it */ + if (!mtrr_need_cleanup()) + return 0; + + /* print original var MTRRs at first, for debugging: */ + printk(KERN_DEBUG "original variable MTRRs\n"); + print_out_mtrr_range_state(); + + memset(range, 0, sizeof(range)); + extra_remove_size = 0; + extra_remove_base = 1 << (32 - PAGE_SHIFT); + if (mtrr_tom2) + extra_remove_size = + (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; + nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, + extra_remove_size); + /* + * [0, 1M) should always be coverred by var mtrr with WB + * and fixed mtrrs should take effective before var mtrr for it + */ + nr_range = add_range_with_merge(range, nr_range, 0, + (1ULL<<(20 - PAGE_SHIFT)) - 1); + /* sort the ranges */ + sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + + range_sums = sum_ranges(range, nr_range); + printk(KERN_INFO "total RAM coverred: %ldM\n", + range_sums >> (20 - PAGE_SHIFT)); + + if (mtrr_chunk_size && mtrr_gran_size) { + i = 0; + mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, + extra_remove_base, extra_remove_size, i); + + mtrr_print_out_one_result(i); + + if (!result[i].bad) { + set_var_mtrr_all(address_bits); + printk(KERN_DEBUG "New variable MTRRs\n"); + print_out_mtrr_range_state(); + return 1; + } + printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " + "will find optimal one\n"); + } + + i = 0; + memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); + memset(result, 0, sizeof(result)); + for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { + + for (chunk_size = gran_size; chunk_size < (1ULL<<32); + chunk_size <<= 1) { + + if (i >= NUM_RESULT) + continue; + + mtrr_calc_range_state(chunk_size, gran_size, + extra_remove_base, extra_remove_size, i); + if (debug_print) { + mtrr_print_out_one_result(i); + printk(KERN_INFO "\n"); + } + + i++; + } + } + + /* try to find the optimal index */ + index_good = mtrr_search_optimal_index(); + + if (index_good != -1) { + printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); + i = index_good; + mtrr_print_out_one_result(i); + + /* convert ranges to var ranges state */ + chunk_size = result[i].chunk_sizek; + chunk_size <<= 10; + gran_size = result[i].gran_sizek; + gran_size <<= 10; + x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); + set_var_mtrr_all(address_bits); + printk(KERN_DEBUG "New variable MTRRs\n"); + print_out_mtrr_range_state(); + return 1; + } else { + /* print out all */ + for (i = 0; i < NUM_RESULT; i++) + mtrr_print_out_one_result(i); + } + + printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); + printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n"); + + return 0; +} +#else +int __init mtrr_cleanup(unsigned address_bits) +{ + return 0; +} +#endif + +static int disable_mtrr_trim; + +static int __init disable_mtrr_trim_setup(char *str) +{ + disable_mtrr_trim = 1; + return 0; +} +early_param("disable_mtrr_trim", disable_mtrr_trim_setup); + +/* + * Newer AMD K8s and later CPUs have a special magic MSR way to force WB + * for memory >4GB. Check for that here. + * Note this won't check if the MTRRs < 4GB where the magic bit doesn't + * apply to are wrong, but so far we don't know of any such case in the wild. + */ +#define Tom2Enabled (1U << 21) +#define Tom2ForceMemTypeWB (1U << 22) + +int __init amd_special_default_mtrr(void) +{ + u32 l, h; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return 0; + if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) + return 0; + /* In case some hypervisor doesn't pass SYSCFG through */ + if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) + return 0; + /* + * Memory between 4GB and top of mem is forced WB by this magic bit. + * Reserved before K8RevF, but should be zero there. + */ + if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) == + (Tom2Enabled | Tom2ForceMemTypeWB)) + return 1; + return 0; +} + +static u64 __init real_trim_memory(unsigned long start_pfn, + unsigned long limit_pfn) +{ + u64 trim_start, trim_size; + trim_start = start_pfn; + trim_start <<= PAGE_SHIFT; + trim_size = limit_pfn; + trim_size <<= PAGE_SHIFT; + trim_size -= trim_start; + + return e820_update_range(trim_start, trim_size, E820_RAM, + E820_RESERVED); +} +/** + * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs + * @end_pfn: ending page frame number + * + * Some buggy BIOSes don't setup the MTRRs properly for systems with certain + * memory configurations. This routine checks that the highest MTRR matches + * the end of memory, to make sure the MTRRs having a write back type cover + * all of the memory the kernel is intending to use. If not, it'll trim any + * memory off the end by adjusting end_pfn, removing it from the kernel's + * allocation pools, warning the user with an obnoxious message. + */ +int __init mtrr_trim_uncached_memory(unsigned long end_pfn) +{ + unsigned long i, base, size, highest_pfn = 0, def, dummy; + mtrr_type type; + u64 total_trim_size; + + /* extra one for all 0 */ + int num[MTRR_NUM_TYPES + 1]; + /* + * Make sure we only trim uncachable memory on machines that + * support the Intel MTRR architecture: + */ + if (!is_cpu(INTEL) || disable_mtrr_trim) + return 0; + rdmsr(MTRRdefType_MSR, def, dummy); + def &= 0xff; + if (def != MTRR_TYPE_UNCACHABLE) + return 0; + + /* get it and store it aside */ + memset(range_state, 0, sizeof(range_state)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + range_state[i].base_pfn = base; + range_state[i].size_pfn = size; + range_state[i].type = type; + } + + /* Find highest cached pfn */ + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type != MTRR_TYPE_WRBACK) + continue; + base = range_state[i].base_pfn; + size = range_state[i].size_pfn; + if (highest_pfn < base + size) + highest_pfn = base + size; + } + + /* kvm/qemu doesn't have mtrr set right, don't trim them all */ + if (!highest_pfn) { + printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); + return 0; + } + + /* check entries number */ + memset(num, 0, sizeof(num)); + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type >= MTRR_NUM_TYPES) + continue; + size = range_state[i].size_pfn; + if (!size) + type = MTRR_NUM_TYPES; + num[type]++; + } + + /* no entry for WB? */ + if (!num[MTRR_TYPE_WRBACK]) + return 0; + + /* check if we only had WB and UC */ + if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != + num_var_ranges - num[MTRR_NUM_TYPES]) + return 0; + + memset(range, 0, sizeof(range)); + nr_range = 0; + if (mtrr_tom2) { + range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); + range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; + if (highest_pfn < range[nr_range].end + 1) + highest_pfn = range[nr_range].end + 1; + nr_range++; + } + nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); + + total_trim_size = 0; + /* check the head */ + if (range[0].start) + total_trim_size += real_trim_memory(0, range[0].start); + /* check the holes */ + for (i = 0; i < nr_range - 1; i++) { + if (range[i].end + 1 < range[i+1].start) + total_trim_size += real_trim_memory(range[i].end + 1, + range[i+1].start); + } + /* check the top */ + i = nr_range - 1; + if (range[i].end + 1 < end_pfn) + total_trim_size += real_trim_memory(range[i].end + 1, + end_pfn); + + if (total_trim_size) { + printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" + " all of memory, losing %lluMB of RAM.\n", + total_trim_size >> 20); + + if (!changed_by_mtrr_cleanup) + WARN_ON(1); + + printk(KERN_INFO "update e820 for mtrr\n"); + update_e820(); + + return 1; + } + + return 0; +} + diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 311a4734609..5c2e266f41d 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -610,1060 +610,7 @@ static struct sysdev_driver mtrr_sysdev_driver = { .resume = mtrr_restore, }; -/* should be related to MTRR_VAR_RANGES nums */ -#define RANGE_NUM 256 - -struct res_range { - unsigned long start; - unsigned long end; -}; - -static int __init -add_range(struct res_range *range, int nr_range, unsigned long start, - unsigned long end) -{ - /* out of slots */ - if (nr_range >= RANGE_NUM) - return nr_range; - - range[nr_range].start = start; - range[nr_range].end = end; - - nr_range++; - - return nr_range; -} - -static int __init -add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, - unsigned long end) -{ - int i; - - /* try to merge it with old one */ - for (i = 0; i < nr_range; i++) { - unsigned long final_start, final_end; - unsigned long common_start, common_end; - - if (!range[i].end) - continue; - - common_start = max(range[i].start, start); - common_end = min(range[i].end, end); - if (common_start > common_end + 1) - continue; - - final_start = min(range[i].start, start); - final_end = max(range[i].end, end); - - range[i].start = final_start; - range[i].end = final_end; - return nr_range; - } - - /* need to add that */ - return add_range(range, nr_range, start, end); -} - -static void __init -subtract_range(struct res_range *range, unsigned long start, unsigned long end) -{ - int i, j; - - for (j = 0; j < RANGE_NUM; j++) { - if (!range[j].end) - continue; - - if (start <= range[j].start && end >= range[j].end) { - range[j].start = 0; - range[j].end = 0; - continue; - } - - if (start <= range[j].start && end < range[j].end && - range[j].start < end + 1) { - range[j].start = end + 1; - continue; - } - - - if (start > range[j].start && end >= range[j].end && - range[j].end > start - 1) { - range[j].end = start - 1; - continue; - } - - if (start > range[j].start && end < range[j].end) { - /* find the new spare */ - for (i = 0; i < RANGE_NUM; i++) { - if (range[i].end == 0) - break; - } - if (i < RANGE_NUM) { - range[i].end = range[j].end; - range[i].start = end + 1; - } else { - printk(KERN_ERR "run of slot in ranges\n"); - } - range[j].end = start - 1; - continue; - } - } -} - -static int __init cmp_range(const void *x1, const void *x2) -{ - const struct res_range *r1 = x1; - const struct res_range *r2 = x2; - long start1, start2; - - start1 = r1->start; - start2 = r2->start; - - return start1 - start2; -} - -struct var_mtrr_range_state { - unsigned long base_pfn; - unsigned long size_pfn; - mtrr_type type; -}; - -static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; -static int __initdata debug_print; - -static int __init -x86_get_mtrr_mem_range(struct res_range *range, int nr_range, - unsigned long extra_remove_base, - unsigned long extra_remove_size) -{ - unsigned long i, base, size; - mtrr_type type; - - for (i = 0; i < num_var_ranges; i++) { - type = range_state[i].type; - if (type != MTRR_TYPE_WRBACK) - continue; - base = range_state[i].base_pfn; - size = range_state[i].size_pfn; - nr_range = add_range_with_merge(range, nr_range, base, - base + size - 1); - } - if (debug_print) { - printk(KERN_DEBUG "After WB checking\n"); - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", - range[i].start, range[i].end + 1); - } - - /* take out UC ranges */ - for (i = 0; i < num_var_ranges; i++) { - type = range_state[i].type; - if (type != MTRR_TYPE_UNCACHABLE && - type != MTRR_TYPE_WRPROT) - continue; - size = range_state[i].size_pfn; - if (!size) - continue; - base = range_state[i].base_pfn; - subtract_range(range, base, base + size - 1); - } - if (extra_remove_size) - subtract_range(range, extra_remove_base, - extra_remove_base + extra_remove_size - 1); - - /* get new range num */ - nr_range = 0; - for (i = 0; i < RANGE_NUM; i++) { - if (!range[i].end) - continue; - nr_range++; - } - if (debug_print) { - printk(KERN_DEBUG "After UC checking\n"); - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", - range[i].start, range[i].end + 1); - } - - /* sort the ranges */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); - if (debug_print) { - printk(KERN_DEBUG "After sorting\n"); - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", - range[i].start, range[i].end + 1); - } - - /* clear those is not used */ - for (i = nr_range; i < RANGE_NUM; i++) - memset(&range[i], 0, sizeof(range[i])); - - return nr_range; -} - -static struct res_range __initdata range[RANGE_NUM]; -static int __initdata nr_range; - -#ifdef CONFIG_MTRR_SANITIZER - -static unsigned long __init sum_ranges(struct res_range *range, int nr_range) -{ - unsigned long sum; - int i; - - sum = 0; - for (i = 0; i < nr_range; i++) - sum += range[i].end + 1 - range[i].start; - - return sum; -} - -static int enable_mtrr_cleanup __initdata = - CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT; - -static int __init disable_mtrr_cleanup_setup(char *str) -{ - enable_mtrr_cleanup = 0; - return 0; -} -early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); - -static int __init enable_mtrr_cleanup_setup(char *str) -{ - enable_mtrr_cleanup = 1; - return 0; -} -early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); - -static int __init mtrr_cleanup_debug_setup(char *str) -{ - debug_print = 1; - return 0; -} -early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); - -struct var_mtrr_state { - unsigned long range_startk; - unsigned long range_sizek; - unsigned long chunk_sizek; - unsigned long gran_sizek; - unsigned int reg; -}; - -static void __init -set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type, unsigned int address_bits) -{ - u32 base_lo, base_hi, mask_lo, mask_hi; - u64 base, mask; - - if (!sizek) { - fill_mtrr_var_range(reg, 0, 0, 0, 0); - return; - } - - mask = (1ULL << address_bits) - 1; - mask &= ~((((u64)sizek) << 10) - 1); - - base = ((u64)basek) << 10; - - base |= type; - mask |= 0x800; - - base_lo = base & ((1ULL<<32) - 1); - base_hi = base >> 32; - - mask_lo = mask & ((1ULL<<32) - 1); - mask_hi = mask >> 32; - - fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi); -} - -static void __init -save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type) -{ - range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); - range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); - range_state[reg].type = type; -} - -static void __init -set_var_mtrr_all(unsigned int address_bits) -{ - unsigned long basek, sizek; - unsigned char type; - unsigned int reg; - - for (reg = 0; reg < num_var_ranges; reg++) { - basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10); - sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10); - type = range_state[reg].type; - - set_var_mtrr(reg, basek, sizek, type, address_bits); - } -} - -static unsigned long to_size_factor(unsigned long sizek, char *factorp) -{ - char factor; - unsigned long base = sizek; - - if (base & ((1<<10) - 1)) { - /* not MB alignment */ - factor = 'K'; - } else if (base & ((1<<20) - 1)){ - factor = 'M'; - base >>= 10; - } else { - factor = 'G'; - base >>= 20; - } - - *factorp = factor; - - return base; -} - -static unsigned int __init -range_to_mtrr(unsigned int reg, unsigned long range_startk, - unsigned long range_sizek, unsigned char type) -{ - if (!range_sizek || (reg >= num_var_ranges)) - return reg; - - while (range_sizek) { - unsigned long max_align, align; - unsigned long sizek; - - /* Compute the maximum size I can make a range */ - if (range_startk) - max_align = ffs(range_startk) - 1; - else - max_align = 32; - align = fls(range_sizek) - 1; - if (align > max_align) - align = max_align; - - sizek = 1 << align; - if (debug_print) { - char start_factor = 'K', size_factor = 'K'; - unsigned long start_base, size_base; - - start_base = to_size_factor(range_startk, &start_factor), - size_base = to_size_factor(sizek, &size_factor), - - printk(KERN_DEBUG "Setting variable MTRR %d, " - "base: %ld%cB, range: %ld%cB, type %s\n", - reg, start_base, start_factor, - size_base, size_factor, - (type == MTRR_TYPE_UNCACHABLE)?"UC": - ((type == MTRR_TYPE_WRBACK)?"WB":"Other") - ); - } - save_var_mtrr(reg++, range_startk, sizek, type); - range_startk += sizek; - range_sizek -= sizek; - if (reg >= num_var_ranges) - break; - } - return reg; -} - -static unsigned __init -range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, - unsigned long sizek) -{ - unsigned long hole_basek, hole_sizek; - unsigned long second_basek, second_sizek; - unsigned long range0_basek, range0_sizek; - unsigned long range_basek, range_sizek; - unsigned long chunk_sizek; - unsigned long gran_sizek; - - hole_basek = 0; - hole_sizek = 0; - second_basek = 0; - second_sizek = 0; - chunk_sizek = state->chunk_sizek; - gran_sizek = state->gran_sizek; - - /* align with gran size, prevent small block used up MTRRs */ - range_basek = ALIGN(state->range_startk, gran_sizek); - if ((range_basek > basek) && basek) - return second_sizek; - state->range_sizek -= (range_basek - state->range_startk); - range_sizek = ALIGN(state->range_sizek, gran_sizek); - - while (range_sizek > state->range_sizek) { - range_sizek -= gran_sizek; - if (!range_sizek) - return 0; - } - state->range_sizek = range_sizek; - - /* try to append some small hole */ - range0_basek = state->range_startk; - range0_sizek = ALIGN(state->range_sizek, chunk_sizek); - - /* no increase */ - if (range0_sizek == state->range_sizek) { - if (debug_print) - printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", - range0_basek<<10, - (range0_basek + state->range_sizek)<<10); - state->reg = range_to_mtrr(state->reg, range0_basek, - state->range_sizek, MTRR_TYPE_WRBACK); - return 0; - } - - /* only cut back, when it is not the last */ - if (sizek) { - while (range0_basek + range0_sizek > (basek + sizek)) { - if (range0_sizek >= chunk_sizek) - range0_sizek -= chunk_sizek; - else - range0_sizek = 0; - - if (!range0_sizek) - break; - } - } - -second_try: - range_basek = range0_basek + range0_sizek; - - /* one hole in the middle */ - if (range_basek > basek && range_basek <= (basek + sizek)) - second_sizek = range_basek - basek; - - if (range0_sizek > state->range_sizek) { - - /* one hole in middle or at end */ - hole_sizek = range0_sizek - state->range_sizek - second_sizek; - - /* hole size should be less than half of range0 size */ - if (hole_sizek >= (range0_sizek >> 1) && - range0_sizek >= chunk_sizek) { - range0_sizek -= chunk_sizek; - second_sizek = 0; - hole_sizek = 0; - - goto second_try; - } - } - - if (range0_sizek) { - if (debug_print) - printk(KERN_DEBUG "range0: %016lx - %016lx\n", - range0_basek<<10, - (range0_basek + range0_sizek)<<10); - state->reg = range_to_mtrr(state->reg, range0_basek, - range0_sizek, MTRR_TYPE_WRBACK); - } - - if (range0_sizek < state->range_sizek) { - /* need to handle left over */ - range_sizek = state->range_sizek - range0_sizek; - - if (debug_print) - printk(KERN_DEBUG "range: %016lx - %016lx\n", - range_basek<<10, - (range_basek + range_sizek)<<10); - state->reg = range_to_mtrr(state->reg, range_basek, - range_sizek, MTRR_TYPE_WRBACK); - } - - if (hole_sizek) { - hole_basek = range_basek - hole_sizek - second_sizek; - if (debug_print) - printk(KERN_DEBUG "hole: %016lx - %016lx\n", - hole_basek<<10, - (hole_basek + hole_sizek)<<10); - state->reg = range_to_mtrr(state->reg, hole_basek, - hole_sizek, MTRR_TYPE_UNCACHABLE); - } - - return second_sizek; -} - -static void __init -set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, - unsigned long size_pfn) -{ - unsigned long basek, sizek; - unsigned long second_sizek = 0; - - if (state->reg >= num_var_ranges) - return; - - basek = base_pfn << (PAGE_SHIFT - 10); - sizek = size_pfn << (PAGE_SHIFT - 10); - - /* See if I can merge with the last range */ - if ((basek <= 1024) || - (state->range_startk + state->range_sizek == basek)) { - unsigned long endk = basek + sizek; - state->range_sizek = endk - state->range_startk; - return; - } - /* Write the range mtrrs */ - if (state->range_sizek != 0) - second_sizek = range_to_mtrr_with_hole(state, basek, sizek); - - /* Allocate an msr */ - state->range_startk = basek + second_sizek; - state->range_sizek = sizek - second_sizek; -} - -/* mininum size of mtrr block that can take hole */ -static u64 mtrr_chunk_size __initdata = (256ULL<<20); - -static int __init parse_mtrr_chunk_size_opt(char *p) -{ - if (!p) - return -EINVAL; - mtrr_chunk_size = memparse(p, &p); - return 0; -} -early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); - -/* granity of mtrr of block */ -static u64 mtrr_gran_size __initdata; - -static int __init parse_mtrr_gran_size_opt(char *p) -{ - if (!p) - return -EINVAL; - mtrr_gran_size = memparse(p, &p); - return 0; -} -early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); - -static int nr_mtrr_spare_reg __initdata = - CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; - -static int __init parse_mtrr_spare_reg(char *arg) -{ - if (arg) - nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); - return 0; -} - -early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); - -static int __init -x86_setup_var_mtrrs(struct res_range *range, int nr_range, - u64 chunk_size, u64 gran_size) -{ - struct var_mtrr_state var_state; - int i; - int num_reg; - - var_state.range_startk = 0; - var_state.range_sizek = 0; - var_state.reg = 0; - var_state.chunk_sizek = chunk_size >> 10; - var_state.gran_sizek = gran_size >> 10; - - memset(range_state, 0, sizeof(range_state)); - - /* Write the range etc */ - for (i = 0; i < nr_range; i++) - set_var_mtrr_range(&var_state, range[i].start, - range[i].end - range[i].start + 1); - - /* Write the last range */ - if (var_state.range_sizek != 0) - range_to_mtrr_with_hole(&var_state, 0, 0); - - num_reg = var_state.reg; - /* Clear out the extra MTRR's */ - while (var_state.reg < num_var_ranges) { - save_var_mtrr(var_state.reg, 0, 0, 0); - var_state.reg++; - } - - return num_reg; -} - -struct mtrr_cleanup_result { - unsigned long gran_sizek; - unsigned long chunk_sizek; - unsigned long lose_cover_sizek; - unsigned int num_reg; - int bad; -}; - -/* - * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G - * chunk size: gran_size, ..., 2G - * so we need (1+16)*8 - */ -#define NUM_RESULT 136 -#define PSHIFT (PAGE_SHIFT - 10) - -static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; -static unsigned long __initdata min_loss_pfn[RANGE_NUM]; - -static void __init print_out_mtrr_range_state(void) -{ - int i; - char start_factor = 'K', size_factor = 'K'; - unsigned long start_base, size_base; - mtrr_type type; - - for (i = 0; i < num_var_ranges; i++) { - - size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); - if (!size_base) - continue; - - size_base = to_size_factor(size_base, &size_factor), - start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); - start_base = to_size_factor(start_base, &start_factor), - type = range_state[i].type; - - printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", - i, start_base, start_factor, - size_base, size_factor, - (type == MTRR_TYPE_UNCACHABLE) ? "UC" : - ((type == MTRR_TYPE_WRPROT) ? "WP" : - ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")) - ); - } -} - -static int __init mtrr_need_cleanup(void) -{ - int i; - mtrr_type type; - unsigned long size; - /* extra one for all 0 */ - int num[MTRR_NUM_TYPES + 1]; - - /* check entries number */ - memset(num, 0, sizeof(num)); - for (i = 0; i < num_var_ranges; i++) { - type = range_state[i].type; - size = range_state[i].size_pfn; - if (type >= MTRR_NUM_TYPES) - continue; - if (!size) - type = MTRR_NUM_TYPES; - if (type == MTRR_TYPE_WRPROT) - type = MTRR_TYPE_UNCACHABLE; - num[type]++; - } - - /* check if we got UC entries */ - if (!num[MTRR_TYPE_UNCACHABLE]) - return 0; - - /* check if we only had WB and UC */ - if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != - num_var_ranges - num[MTRR_NUM_TYPES]) - return 0; - - return 1; -} - -static unsigned long __initdata range_sums; -static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size, - unsigned long extra_remove_base, - unsigned long extra_remove_size, - int i) -{ - int num_reg; - static struct res_range range_new[RANGE_NUM]; - static int nr_range_new; - unsigned long range_sums_new; - - /* convert ranges to var ranges state */ - num_reg = x86_setup_var_mtrrs(range, nr_range, - chunk_size, gran_size); - - /* we got new setting in range_state, check it */ - memset(range_new, 0, sizeof(range_new)); - nr_range_new = x86_get_mtrr_mem_range(range_new, 0, - extra_remove_base, extra_remove_size); - range_sums_new = sum_ranges(range_new, nr_range_new); - - result[i].chunk_sizek = chunk_size >> 10; - result[i].gran_sizek = gran_size >> 10; - result[i].num_reg = num_reg; - if (range_sums < range_sums_new) { - result[i].lose_cover_sizek = - (range_sums_new - range_sums) << PSHIFT; - result[i].bad = 1; - } else - result[i].lose_cover_sizek = - (range_sums - range_sums_new) << PSHIFT; - - /* double check it */ - if (!result[i].bad && !result[i].lose_cover_sizek) { - if (nr_range_new != nr_range || - memcmp(range, range_new, sizeof(range))) - result[i].bad = 1; - } - - if (!result[i].bad && (range_sums - range_sums_new < - min_loss_pfn[num_reg])) { - min_loss_pfn[num_reg] = - range_sums - range_sums_new; - } -} - -static void __init mtrr_print_out_one_result(int i) -{ - char gran_factor, chunk_factor, lose_factor; - unsigned long gran_base, chunk_base, lose_base; - - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), - printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", - result[i].bad ? "*BAD*" : " ", - gran_base, gran_factor, chunk_base, chunk_factor); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", - result[i].num_reg, result[i].bad ? "-" : "", - lose_base, lose_factor); -} - -static int __init mtrr_search_optimal_index(void) -{ - int i; - int num_reg_good; - int index_good; - - if (nr_mtrr_spare_reg >= num_var_ranges) - nr_mtrr_spare_reg = num_var_ranges - 1; - num_reg_good = -1; - for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { - if (!min_loss_pfn[i]) - num_reg_good = i; - } - - index_good = -1; - if (num_reg_good != -1) { - for (i = 0; i < NUM_RESULT; i++) { - if (!result[i].bad && - result[i].num_reg == num_reg_good && - !result[i].lose_cover_sizek) { - index_good = i; - break; - } - } - } - - return index_good; -} - - -static int __init mtrr_cleanup(unsigned address_bits) -{ - unsigned long extra_remove_base, extra_remove_size; - unsigned long base, size, def, dummy; - mtrr_type type; - u64 chunk_size, gran_size; - int index_good; - int i; - - if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) - return 0; - rdmsr(MTRRdefType_MSR, def, dummy); - def &= 0xff; - if (def != MTRR_TYPE_UNCACHABLE) - return 0; - - /* get it and store it aside */ - memset(range_state, 0, sizeof(range_state)); - for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, &base, &size, &type); - range_state[i].base_pfn = base; - range_state[i].size_pfn = size; - range_state[i].type = type; - } - - /* check if we need handle it and can handle it */ - if (!mtrr_need_cleanup()) - return 0; - - /* print original var MTRRs at first, for debugging: */ - printk(KERN_DEBUG "original variable MTRRs\n"); - print_out_mtrr_range_state(); - - memset(range, 0, sizeof(range)); - extra_remove_size = 0; - extra_remove_base = 1 << (32 - PAGE_SHIFT); - if (mtrr_tom2) - extra_remove_size = - (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; - nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, - extra_remove_size); - /* - * [0, 1M) should always be coverred by var mtrr with WB - * and fixed mtrrs should take effective before var mtrr for it - */ - nr_range = add_range_with_merge(range, nr_range, 0, - (1ULL<<(20 - PAGE_SHIFT)) - 1); - /* sort the ranges */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); - - range_sums = sum_ranges(range, nr_range); - printk(KERN_INFO "total RAM coverred: %ldM\n", - range_sums >> (20 - PAGE_SHIFT)); - - if (mtrr_chunk_size && mtrr_gran_size) { - i = 0; - mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, - extra_remove_base, extra_remove_size, i); - - mtrr_print_out_one_result(i); - - if (!result[i].bad) { - set_var_mtrr_all(address_bits); - printk(KERN_DEBUG "New variable MTRRs\n"); - print_out_mtrr_range_state(); - return 1; - } - printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " - "will find optimal one\n"); - } - - i = 0; - memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); - memset(result, 0, sizeof(result)); - for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { - - for (chunk_size = gran_size; chunk_size < (1ULL<<32); - chunk_size <<= 1) { - - if (i >= NUM_RESULT) - continue; - - mtrr_calc_range_state(chunk_size, gran_size, - extra_remove_base, extra_remove_size, i); - if (debug_print) { - mtrr_print_out_one_result(i); - printk(KERN_INFO "\n"); - } - - i++; - } - } - - /* try to find the optimal index */ - index_good = mtrr_search_optimal_index(); - - if (index_good != -1) { - printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); - i = index_good; - mtrr_print_out_one_result(i); - - /* convert ranges to var ranges state */ - chunk_size = result[i].chunk_sizek; - chunk_size <<= 10; - gran_size = result[i].gran_sizek; - gran_size <<= 10; - x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); - set_var_mtrr_all(address_bits); - printk(KERN_DEBUG "New variable MTRRs\n"); - print_out_mtrr_range_state(); - return 1; - } else { - /* print out all */ - for (i = 0; i < NUM_RESULT; i++) - mtrr_print_out_one_result(i); - } - - printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); - printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n"); - - return 0; -} -#else -static int __init mtrr_cleanup(unsigned address_bits) -{ - return 0; -} -#endif - -static int __initdata changed_by_mtrr_cleanup; - -static int disable_mtrr_trim; - -static int __init disable_mtrr_trim_setup(char *str) -{ - disable_mtrr_trim = 1; - return 0; -} -early_param("disable_mtrr_trim", disable_mtrr_trim_setup); - -/* - * Newer AMD K8s and later CPUs have a special magic MSR way to force WB - * for memory >4GB. Check for that here. - * Note this won't check if the MTRRs < 4GB where the magic bit doesn't - * apply to are wrong, but so far we don't know of any such case in the wild. - */ -#define Tom2Enabled (1U << 21) -#define Tom2ForceMemTypeWB (1U << 22) - -int __init amd_special_default_mtrr(void) -{ - u32 l, h; - - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) - return 0; - if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) - return 0; - /* In case some hypervisor doesn't pass SYSCFG through */ - if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) - return 0; - /* - * Memory between 4GB and top of mem is forced WB by this magic bit. - * Reserved before K8RevF, but should be zero there. - */ - if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) == - (Tom2Enabled | Tom2ForceMemTypeWB)) - return 1; - return 0; -} - -static u64 __init real_trim_memory(unsigned long start_pfn, - unsigned long limit_pfn) -{ - u64 trim_start, trim_size; - trim_start = start_pfn; - trim_start <<= PAGE_SHIFT; - trim_size = limit_pfn; - trim_size <<= PAGE_SHIFT; - trim_size -= trim_start; - - return e820_update_range(trim_start, trim_size, E820_RAM, - E820_RESERVED); -} -/** - * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs - * @end_pfn: ending page frame number - * - * Some buggy BIOSes don't setup the MTRRs properly for systems with certain - * memory configurations. This routine checks that the highest MTRR matches - * the end of memory, to make sure the MTRRs having a write back type cover - * all of the memory the kernel is intending to use. If not, it'll trim any - * memory off the end by adjusting end_pfn, removing it from the kernel's - * allocation pools, warning the user with an obnoxious message. - */ -int __init mtrr_trim_uncached_memory(unsigned long end_pfn) -{ - unsigned long i, base, size, highest_pfn = 0, def, dummy; - mtrr_type type; - u64 total_trim_size; - - /* extra one for all 0 */ - int num[MTRR_NUM_TYPES + 1]; - /* - * Make sure we only trim uncachable memory on machines that - * support the Intel MTRR architecture: - */ - if (!is_cpu(INTEL) || disable_mtrr_trim) - return 0; - rdmsr(MTRRdefType_MSR, def, dummy); - def &= 0xff; - if (def != MTRR_TYPE_UNCACHABLE) - return 0; - - /* get it and store it aside */ - memset(range_state, 0, sizeof(range_state)); - for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, &base, &size, &type); - range_state[i].base_pfn = base; - range_state[i].size_pfn = size; - range_state[i].type = type; - } - - /* Find highest cached pfn */ - for (i = 0; i < num_var_ranges; i++) { - type = range_state[i].type; - if (type != MTRR_TYPE_WRBACK) - continue; - base = range_state[i].base_pfn; - size = range_state[i].size_pfn; - if (highest_pfn < base + size) - highest_pfn = base + size; - } - - /* kvm/qemu doesn't have mtrr set right, don't trim them all */ - if (!highest_pfn) { - printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); - return 0; - } - - /* check entries number */ - memset(num, 0, sizeof(num)); - for (i = 0; i < num_var_ranges; i++) { - type = range_state[i].type; - if (type >= MTRR_NUM_TYPES) - continue; - size = range_state[i].size_pfn; - if (!size) - type = MTRR_NUM_TYPES; - num[type]++; - } - - /* no entry for WB? */ - if (!num[MTRR_TYPE_WRBACK]) - return 0; - - /* check if we only had WB and UC */ - if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != - num_var_ranges - num[MTRR_NUM_TYPES]) - return 0; - - memset(range, 0, sizeof(range)); - nr_range = 0; - if (mtrr_tom2) { - range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); - range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; - if (highest_pfn < range[nr_range].end + 1) - highest_pfn = range[nr_range].end + 1; - nr_range++; - } - nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); - - total_trim_size = 0; - /* check the head */ - if (range[0].start) - total_trim_size += real_trim_memory(0, range[0].start); - /* check the holes */ - for (i = 0; i < nr_range - 1; i++) { - if (range[i].end + 1 < range[i+1].start) - total_trim_size += real_trim_memory(range[i].end + 1, - range[i+1].start); - } - /* check the top */ - i = nr_range - 1; - if (range[i].end + 1 < end_pfn) - total_trim_size += real_trim_memory(range[i].end + 1, - end_pfn); - - if (total_trim_size) { - printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" - " all of memory, losing %lluMB of RAM.\n", - total_trim_size >> 20); - - if (!changed_by_mtrr_cleanup) - WARN_ON(1); - - printk(KERN_INFO "update e820 for mtrr\n"); - update_e820(); - - return 1; - } - - return 0; -} +int __initdata changed_by_mtrr_cleanup; /** * mtrr_bp_init - initialize mtrrs on the boot CPU diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index ffd60409cc6..6710e93021a 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -88,3 +88,6 @@ void mtrr_wrmsr(unsigned, unsigned, unsigned); int amd_init_mtrr(void); int cyrix_init_mtrr(void); int centaur_init_mtrr(void); + +extern int changed_by_mtrr_cleanup; +extern int mtrr_cleanup(unsigned address_bits); -- cgit v1.2.3 From 91219bcbdcccc1686b0ecce09e28825c93619c07 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 12 Mar 2009 02:37:00 +0530 Subject: x86: cpu_debug add write support for MSRs Supported write flag for registers. currently write is enabled only for PMC MSR. [root@ht]# cat /sys/kernel/debug/x86/cpu/cpu1/pmc/0x300/value 0x0 [root@ht]# echo 1234 > /sys/kernel/debug/x86/cpu/cpu1/pmc/0x300/value [root@ht]# cat /sys/kernel/debug/x86/cpu/cpu1/pmc/0x300/value 0x4d2 [root@ht]# echo 0x1234 > /sys/kernel/debug/x86/cpu/cpu1/pmc/0x300/value [root@ht]# cat /sys/kernel/debug/x86/cpu/cpu1/pmc/0x300/value 0x1234 Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpu_debug.c | 118 +++++++++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 32 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 9abbcbd933c..21c0cf8ced1 100755 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -40,41 +41,41 @@ static DEFINE_MUTEX(cpu_debug_lock); static struct dentry *cpu_debugfs_dir; static struct cpu_debug_base cpu_base[] = { - { "mc", CPU_MC }, /* Machine Check */ - { "monitor", CPU_MONITOR }, /* Monitor */ - { "time", CPU_TIME }, /* Time */ - { "pmc", CPU_PMC }, /* Performance Monitor */ - { "platform", CPU_PLATFORM }, /* Platform */ - { "apic", CPU_APIC }, /* APIC */ - { "poweron", CPU_POWERON }, /* Power-on */ - { "control", CPU_CONTROL }, /* Control */ - { "features", CPU_FEATURES }, /* Features control */ - { "lastbranch", CPU_LBRANCH }, /* Last Branch */ - { "bios", CPU_BIOS }, /* BIOS */ - { "freq", CPU_FREQ }, /* Frequency */ - { "mtrr", CPU_MTRR }, /* MTRR */ - { "perf", CPU_PERF }, /* Performance */ - { "cache", CPU_CACHE }, /* Cache */ - { "sysenter", CPU_SYSENTER }, /* Sysenter */ - { "therm", CPU_THERM }, /* Thermal */ - { "misc", CPU_MISC }, /* Miscellaneous */ - { "debug", CPU_DEBUG }, /* Debug */ - { "pat", CPU_PAT }, /* PAT */ - { "vmx", CPU_VMX }, /* VMX */ - { "call", CPU_CALL }, /* System Call */ - { "base", CPU_BASE }, /* BASE Address */ - { "smm", CPU_SMM }, /* System mgmt mode */ - { "svm", CPU_SVM }, /*Secure Virtial Machine*/ - { "osvm", CPU_OSVM }, /* OS-Visible Workaround*/ - { "tss", CPU_TSS }, /* Task Stack Segment */ - { "cr", CPU_CR }, /* Control Registers */ - { "dt", CPU_DT }, /* Descriptor Table */ - { "registers", CPU_REG_ALL }, /* Select all Registers */ + { "mc", CPU_MC, 0 }, + { "monitor", CPU_MONITOR, 0 }, + { "time", CPU_TIME, 0 }, + { "pmc", CPU_PMC, 1 }, + { "platform", CPU_PLATFORM, 0 }, + { "apic", CPU_APIC, 0 }, + { "poweron", CPU_POWERON, 0 }, + { "control", CPU_CONTROL, 0 }, + { "features", CPU_FEATURES, 0 }, + { "lastbranch", CPU_LBRANCH, 0 }, + { "bios", CPU_BIOS, 0 }, + { "freq", CPU_FREQ, 0 }, + { "mtrr", CPU_MTRR, 0 }, + { "perf", CPU_PERF, 0 }, + { "cache", CPU_CACHE, 0 }, + { "sysenter", CPU_SYSENTER, 0 }, + { "therm", CPU_THERM, 0 }, + { "misc", CPU_MISC, 0 }, + { "debug", CPU_DEBUG, 0 }, + { "pat", CPU_PAT, 0 }, + { "vmx", CPU_VMX, 0 }, + { "call", CPU_CALL, 0 }, + { "base", CPU_BASE, 0 }, + { "smm", CPU_SMM, 0 }, + { "svm", CPU_SVM, 0 }, + { "osvm", CPU_OSVM, 0 }, + { "tss", CPU_TSS, 0 }, + { "cr", CPU_CR, 0 }, + { "dt", CPU_DT, 0 }, + { "registers", CPU_REG_ALL, 0 }, }; static struct cpu_file_base cpu_file[] = { - { "index", CPU_REG_ALL }, /* index */ - { "value", CPU_REG_ALL }, /* value */ + { "index", CPU_REG_ALL, 0 }, + { "value", CPU_REG_ALL, 1 }, }; /* Intel Registers Range */ @@ -608,9 +609,62 @@ static int cpu_seq_open(struct inode *inode, struct file *file) return err; } +static int write_msr(struct cpu_private *priv, u64 val) +{ + u32 low, high; + + high = (val >> 32) & 0xffffffff; + low = val & 0xffffffff; + + if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high)) + return 0; + + return -EPERM; +} + +static int write_cpu_register(struct cpu_private *priv, const char *buf) +{ + int ret = -EPERM; + u64 val; + + ret = strict_strtoull(buf, 0, &val); + if (ret < 0) + return ret; + + /* Supporting only MSRs */ + if (priv->type < CPU_TSS_BIT) + return write_msr(priv, val); + + return ret; +} + +static ssize_t cpu_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct cpu_private *priv = seq->private; + char buf[19]; + + if ((priv == NULL) || (count >= sizeof(buf))) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, count)) + return -EFAULT; + + buf[count] = 0; + + if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write)) + if (!write_cpu_register(priv, buf)) + return count; + + return -EACCES; +} + static const struct file_operations cpu_fops = { + .owner = THIS_MODULE, .open = cpu_seq_open, .read = seq_read, + .write = cpu_write, .llseek = seq_lseek, .release = seq_release, }; -- cgit v1.2.3 From 773e673de27297d07d852e7e9bfd1a695cae1da2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 12 Mar 2009 21:35:18 -0700 Subject: x86: fix e820_update_range() Impact: fix left range size on head | commit 5c0e6f035df983210e4d22213aed624ced502d3d | x86: fix code paths used by update_mptable | Impact: fix crashes under Xen due to unrobust e820 code fixes one e820 bug, but introduces another bug. Need to update size for left range at first in case it is header. also add __e820_add_region take more parameter. Signed-off-by: Yinghai Lu Cc: jbeulich@novell.com LKML-Reference: <49B9E286.502@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 3cf6681ac80..95b81c18b6b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -110,19 +110,25 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) /* * Add a memory region to the kernel e820 map. */ -void __init e820_add_region(u64 start, u64 size, int type) +static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, + int type) { - int x = e820.nr_map; + int x = e820x->nr_map; - if (x == ARRAY_SIZE(e820.map)) { + if (x == ARRAY_SIZE(e820x->map)) { printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); return; } - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; + e820x->map[x].addr = start; + e820x->map[x].size = size; + e820x->map[x].type = type; + e820x->nr_map++; +} + +void __init e820_add_region(u64 start, u64 size, int type) +{ + __e820_add_region(&e820, start, size, type); } void __init e820_print_map(char *who) @@ -417,11 +423,11 @@ static int __init append_e820_map(struct e820entry *biosmap, int nr_map) return __append_e820_map(biosmap, nr_map); } -static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, +static u64 __init __e820_update_range(struct e820map *e820x, u64 start, u64 size, unsigned old_type, unsigned new_type) { - unsigned int i, x; + unsigned int i; u64 real_updated_size = 0; BUG_ON(old_type == new_type); @@ -447,22 +453,19 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, if (final_start >= final_end) continue; - x = e820x->nr_map; - if (x == ARRAY_SIZE(e820x->map)) { - printk(KERN_ERR "Too many memory map entries!\n"); - break; - } - e820x->map[x].addr = final_start; - e820x->map[x].size = final_end - final_start; - e820x->map[x].type = new_type; - e820x->nr_map++; + __e820_add_region(e820x, final_start, final_end - final_start, + new_type); real_updated_size += final_end - final_start; + /* + * left range could be head or tail, so need to update + * size at first. + */ + ei->size -= final_end - final_start; if (ei->addr < final_start) continue; ei->addr = final_end; - ei->size -= final_end - final_start; } return real_updated_size; } @@ -470,13 +473,13 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, unsigned new_type) { - return e820_update_range_map(&e820, start, size, old_type, new_type); + return __e820_update_range(&e820, start, size, old_type, new_type); } static u64 __init e820_update_range_saved(u64 start, u64 size, unsigned old_type, unsigned new_type) { - return e820_update_range_map(&e820_saved, start, size, old_type, + return __e820_update_range(&e820_saved, start, size, old_type, new_type); } -- cgit v1.2.3 From 3ff42da5048649503e343a32be37b14a6a4e8aaf Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 12 Mar 2009 17:39:37 +0100 Subject: x86: mtrr: don't modify RdDram/WrDram bits of fixed MTRRs Impact: bug fix + BIOS workaround BIOS is expected to clear the SYSCFG[MtrrFixDramModEn] on AMD CPUs after fixed MTRRs are configured. Some BIOSes do not clear SYSCFG[MtrrFixDramModEn] on BP (and on APs). This can lead to obfuscation in Linux when this bit is not cleared on BP but cleared on APs. A consequence of this is that the saved fixed-MTRR state (from BP) differs from the fixed-MTRRs of APs -- because RdDram/WrDram bits are read as zero when SYSCFG[MtrrFixDramModEn] is cleared -- and Linux tries to sync fixed-MTRR state from BP to AP. This implies that Linux sets SYSCFG[MtrrFixDramEn] and activates those bits. More important is that (some) systems change these bits in SMM when ACPI is enabled. Hence it is racy if Linux modifies RdMem/WrMem bits, too. (1) The patch modifies an old fix from Bernhard Kaindl to get suspend/resume working on some Acer Laptops. Bernhard's patch tried to sync RdMem/WrMem bits of fixed MTRR registers and that helped on those old Laptops. (Don't ask me why -- can't test it myself). But this old problem was not the motivation for the patch. (See http://lkml.org/lkml/2007/4/3/110) (2) The more important effect is to fix issues on some more current systems. On those systems Linux panics or just freezes, see http://bugzilla.kernel.org/show_bug.cgi?id=11541 (and also duplicates of this bug: http://bugzilla.kernel.org/show_bug.cgi?id=11737 http://bugzilla.kernel.org/show_bug.cgi?id=11714) The affected systems boot only using acpi=ht, acpi=off or when the kernel is built with CONFIG_MTRR=n. The acpi options prevent full enablement of ACPI. Obviously when ACPI is enabled the BIOS/SMM modfies RdMem/WrMem bits. When CONFIG_MTRR=y Linux also accesses and modifies those bits when it needs to sync fixed-MTRRs across cores (Bernhard's fix, see (1)). How do you synchronize that? You can't. As a consequence Linux shouldn't touch those bits at all (Rationale are AMD's BKDGs which recommend to clear the bit that makes RdMem/WrMem accessible). This is the purpose of this patch. And (so far) this suffices to fix (1) and (2). I suggest not to touch RdDram/WrDram bits of fixed-MTRRs and SYSCFG[MtrrFixDramEn] and to clear SYSCFG[MtrrFixDramModEn] as suggested by AMD K8, and AMD family 10h/11h BKDGs. BIOS is expected to do this anyway. This should avoid that Linux and SMM tread on each other's toes ... Signed-off-by: Andreas Herrmann Cc: trenn@suse.de Cc: Yinghai Lu LKML-Reference: <20090312163937.GH20716@alberich.amd.com> Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 51 ++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 96440352010..950c434f793 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -33,6 +33,32 @@ u64 mtrr_tom2; struct mtrr_state_type mtrr_state = {}; EXPORT_SYMBOL_GPL(mtrr_state); +/** + * BIOS is expected to clear MtrrFixDramModEn bit, see for example + * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD + * Opteron Processors" (26094 Rev. 3.30 February 2006), section + * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set + * to 1 during BIOS initalization of the fixed MTRRs, then cleared to + * 0 for operation." + */ +static inline void k8_check_syscfg_dram_mod_en(void) +{ + u32 lo, hi; + + if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && + (boot_cpu_data.x86 >= 0x0f))) + return; + + rdmsr(MSR_K8_SYSCFG, lo, hi); + if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { + printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" + " not cleared by BIOS, clearing this bit\n", + smp_processor_id()); + lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY; + mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi); + } +} + /* * Returns the effective MTRR type for the region * Error returns: @@ -166,6 +192,8 @@ get_fixed_ranges(mtrr_type * frs) unsigned int *p = (unsigned int *) frs; int i; + k8_check_syscfg_dram_mod_en(); + rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); for (i = 0; i < 2; i++) @@ -305,28 +333,11 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) smp_processor_id(), msr, a, b); } -/** - * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs - * see AMD publication no. 24593, chapter 3.2.1 for more information - */ -static inline void k8_enable_fixed_iorrs(void) -{ - unsigned lo, hi; - - rdmsr(MSR_K8_SYSCFG, lo, hi); - mtrr_wrmsr(MSR_K8_SYSCFG, lo - | K8_MTRRFIXRANGE_DRAM_ENABLE - | K8_MTRRFIXRANGE_DRAM_MODIFY, hi); -} - /** * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have * @msr: MSR address of the MTTR which should be checked and updated * @changed: pointer which indicates whether the MTRR needed to be changed * @msrwords: pointer to the MSR values which the MSR should have - * - * If K8 extentions are wanted, update the K8 SYSCFG MSR also. - * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information. */ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) { @@ -335,10 +346,6 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) rdmsr(msr, lo, hi); if (lo != msrwords[0] || hi != msrwords[1]) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) && - ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK)) - k8_enable_fixed_iorrs(); mtrr_wrmsr(msr, msrwords[0], msrwords[1]); *changed = true; } @@ -426,6 +433,8 @@ static int set_fixed_ranges(mtrr_type * frs) bool changed = false; int block=-1, range; + k8_check_syscfg_dram_mod_en(); + while (fixed_range_blocks[++block].ranges) for (range=0; range < fixed_range_blocks[block].ranges; range++) set_fixed_range(fixed_range_blocks[block].base_msr + range, -- cgit v1.2.3 From 5a8ac9d28dae5330c70562c7d7785f5104059c17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9rico=20Wang?= Date: Fri, 13 Mar 2009 15:56:58 +0800 Subject: x86: ptrace, bts: fix an unreachable statement Commit c2724775ce57c98b8af9694857b941dc61056516 put a statement after return, which makes that statement unreachable. Move that statement before return. Signed-off-by: WANG Cong Cc: Roland McGrath Cc: Markus Metzger LKML-Reference: <20090313075622.GB8933@hack> Cc: # .29 only Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 3d9672e59c1..19378715f41 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -685,9 +685,8 @@ static int ptrace_bts_config(struct task_struct *child, if (!cfg.signal) return -EINVAL; - return -EOPNOTSUPP; - child->thread.bts_ovfl_signal = cfg.signal; + return -EOPNOTSUPP; } if ((cfg.flags & PTRACE_BTS_O_ALLOC) && -- cgit v1.2.3 From 9766cdbcb260389669e9679b2aa87c11832f479f Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 14 Mar 2009 11:19:49 +0530 Subject: x86: cpu/common.c cleanups - fix various style problems - declare varibles before they get used - introduced clear_all_debug_regs - fix header files issues LKML-Reference: <1237009789.4387.2.camel@localhost.localdomain> Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 129 ++++++++++++++++++++++--------------------- 1 file changed, 65 insertions(+), 64 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 826d5c87627..cad6878c88d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1,52 +1,52 @@ -#include -#include -#include -#include +#include #include +#include #include +#include #include -#include -#include +#include +#include #include +#include +#include +#include #include -#include -#include -#include -#include -#include +#include + +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include #include +#include #include -#include -#include #include -#include -#include -#include #ifdef CONFIG_X86_LOCAL_APIC #include #endif -#include -#include -#include -#include -#include -#include -#include -#include -#include - #include "cpu.h" #ifdef CONFIG_X86_64 /* all of these masks are initialized in setup_cpu_local_masks() */ -cpumask_var_t cpu_callin_mask; -cpumask_var_t cpu_callout_mask; cpumask_var_t cpu_initialized_mask; +cpumask_var_t cpu_callout_mask; +cpumask_var_t cpu_callin_mask; /* representing cpus for which sibling maps can be computed */ cpumask_var_t cpu_sibling_setup_mask; @@ -62,10 +62,10 @@ void __init setup_cpu_local_masks(void) #else /* CONFIG_X86_32 */ -cpumask_t cpu_callin_map; +cpumask_t cpu_sibling_setup_map; cpumask_t cpu_callout_map; cpumask_t cpu_initialized; -cpumask_t cpu_sibling_setup_map; +cpumask_t cpu_callin_map; #endif /* CONFIG_X86_32 */ @@ -79,7 +79,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { * IRET will check the segment types kkeil 2000/10/28 * Also sysret mandates a special GDT layout * - * The TLS descriptors are currently at a different place compared to i386. + * TLS descriptors are currently at a different place compared to i386. * Hopefully nobody expects them at a fixed place (Wine?) */ [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, @@ -243,6 +243,7 @@ cpuid_dependent_features[] = { static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) { const struct cpuid_dependent_feature *df; + for (df = cpuid_dependent_features; df->feature; df++) { /* * Note: cpuid_level is set to -1 if unavailable, but @@ -364,12 +365,12 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c) undo that brain damage */ p = q = &c->x86_model_id[0]; while (*p == ' ') - p++; + p++; if (p != q) { - while (*p) - *q++ = *p++; - while (q <= &c->x86_model_id[48]) - *q++ = '\0'; /* Zero-pad the rest */ + while (*p) + *q++ = *p++; + while (q <= &c->x86_model_id[48]) + *q++ = '\0'; /* Zero-pad the rest */ } } @@ -441,14 +442,15 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) } else if (smp_num_siblings > 1) { if (smp_num_siblings > nr_cpu_ids) { - printk(KERN_WARNING "CPU: Unsupported number of siblings %d", - smp_num_siblings); + pr_warning("CPU: Unsupported number of siblings %d", + smp_num_siblings); smp_num_siblings = 1; return; } index_msb = get_count_order(smp_num_siblings); - c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, + index_msb); smp_num_siblings = smp_num_siblings / c->x86_max_cores; @@ -491,7 +493,8 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) if (!printed) { printed++; - printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v); + printk(KERN_ERR "CPU: vendor_id '%s'" + "unknown, using generic init.\n", v); printk(KERN_ERR "CPU: Your system may be unstable.\n"); } @@ -637,7 +640,7 @@ void __init early_cpu_init(void) struct cpu_dev **cdev; int count = 0; - printk("KERNEL supported cpus:\n"); + printk(KERN_INFO "KERNEL supported cpus:\n"); for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { struct cpu_dev *cpudev = *cdev; unsigned int j; @@ -650,7 +653,7 @@ void __init early_cpu_init(void) for (j = 0; j < 2; j++) { if (!cpudev->c_ident[j]) continue; - printk(" %s %s\n", cpudev->c_vendor, + printk(KERN_INFO " %s %s\n", cpudev->c_vendor, cpudev->c_ident[j]); } } @@ -952,8 +955,6 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]) __aligned(PAGE_SIZE); -extern asmlinkage void ignore_sysret(void); - /* May not be marked __init: used by software suspend */ void syscall_init(void) { @@ -999,6 +1000,22 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) } #endif /* x86_64 */ +/* + * Clear all 6 debug registers: + */ +static void clear_all_debug_regs(void) +{ + int i; + + for (i = 0; i < 8; i++) { + /* Ignore db4, db5 */ + if ((i == 4) || (i == 5)) + continue; + + set_debugreg(0, i); + } +} + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT @@ -1098,17 +1115,7 @@ void __cpuinit cpu_init(void) arch_kgdb_ops.correct_hw_break(); else #endif - { - /* - * Clear all 6 debug registers: - */ - set_debugreg(0UL, 0); - set_debugreg(0UL, 1); - set_debugreg(0UL, 2); - set_debugreg(0UL, 3); - set_debugreg(0UL, 6); - set_debugreg(0UL, 7); - } + clear_all_debug_regs(); fpu_init(); @@ -1129,7 +1136,8 @@ void __cpuinit cpu_init(void) if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) { printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); - for (;;) local_irq_enable(); + for (;;) + local_irq_enable(); } printk(KERN_INFO "Initializing CPU#%d\n", cpu); @@ -1159,13 +1167,7 @@ void __cpuinit cpu_init(void) __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); #endif - /* Clear all 6 debug registers: */ - set_debugreg(0, 0); - set_debugreg(0, 1); - set_debugreg(0, 2); - set_debugreg(0, 3); - set_debugreg(0, 6); - set_debugreg(0, 7); + clear_all_debug_regs(); /* * Force FPU initialization: @@ -1186,5 +1188,4 @@ void __cpuinit cpu_init(void) xsave_init(); } - #endif -- cgit v1.2.3 From 88200bc28da38bcda1cb1bd218216e83b426d8a8 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 14 Mar 2009 12:08:13 +0530 Subject: x86: entry_32.S fix compile warnings - fix work mask bit width Fix: arch/x86/kernel/entry_32.S:446: Warning: 00000000080001d1 shortened to 00000000000001d1 arch/x86/kernel/entry_32.S:457: Warning: 000000000800feff shortened to 000000000000feff arch/x86/kernel/entry_32.S:527: Warning: 00000000080001d1 shortened to 00000000000001d1 arch/x86/kernel/entry_32.S:541: Warning: 000000000800feff shortened to 000000000000feff arch/x86/kernel/entry_32.S:676: Warning: 0000000008000091 shortened to 0000000000000091 TIF_SYSCALL_FTRACE is 0x08000000 and until now we checked the first 16 bits of the work mask - bit 27 falls outside of that. Update the entry_32.S code to check the full 32-bit mask. [ %cx => %ecx fix from Cyrill Gorcunov ] Signed-off-by: Jaswinder Singh Rajput Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: "H. Peter Anvin" LKML-Reference: <1237012693.18733.3.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 899e8938e79..c929add475c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -442,8 +442,7 @@ sysenter_past_esp: GET_THREAD_INFO(%ebp) - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz sysenter_audit sysenter_do_call: cmpl $(nr_syscalls), %eax @@ -454,7 +453,7 @@ sysenter_do_call: DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx - testw $_TIF_ALLWORK_MASK, %cx + testl $_TIF_ALLWORK_MASK, %ecx jne sysexit_audit sysenter_exit: /* if something modifies registers it must also disable sysexit */ @@ -468,7 +467,7 @@ sysenter_exit: #ifdef CONFIG_AUDITSYSCALL sysenter_audit: - testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) jnz syscall_trace_entry addl $4,%esp CFI_ADJUST_CFA_OFFSET -4 @@ -485,7 +484,7 @@ sysenter_audit: jmp sysenter_do_call sysexit_audit: - testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx jne syscall_exit_work TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) @@ -498,7 +497,7 @@ sysexit_audit: DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx - testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx jne syscall_exit_work movl PT_EAX(%esp),%eax /* reload syscall return value */ jmp sysenter_exit @@ -523,8 +522,7 @@ ENTRY(system_call) SAVE_ALL GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -538,7 +536,7 @@ syscall_exit: # between sampling and the iret TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx - testw $_TIF_ALLWORK_MASK, %cx # current->work + testl $_TIF_ALLWORK_MASK, %ecx # current->work jne syscall_exit_work restore_all: @@ -673,7 +671,7 @@ END(syscall_trace_entry) # perform syscall exit tracing ALIGN syscall_exit_work: - testb $_TIF_WORK_SYSCALL_EXIT, %cl + testl $_TIF_WORK_SYSCALL_EXIT, %ecx jz work_pending TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call -- cgit v1.2.3 From 0f3fa48a7eaf5d1118cfda1650e8c759b2a116e4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 14 Mar 2009 08:46:17 +0100 Subject: x86: cpu/common.c more cleanups Complete/fix the cleanups of cpu/common.c: - fix ugly warning due to asm/topology.h -> linux/topology.h change - standardize the style across the file - simplify/refactor the code flow where possible Cc: Jaswinder Singh Rajput LKML-Reference: <1237009789.4387.2.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 243 +++++++++++++++++++++++++------------------ 1 file changed, 143 insertions(+), 100 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cad6878c88d..a9e3791ca09 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1,4 +1,3 @@ -#include #include #include #include @@ -18,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -82,45 +82,45 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { * TLS descriptors are currently at a different place compared to i386. * Hopefully nobody expects them at a fixed place (Wine?) */ - [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, - [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, + [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, + [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, #else - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, /* * Segments used for calling PnP BIOS have byte granularity. * They code segments and data segments have fixed 64k limits, * the transfer segment sizes are set at run time. */ /* 32-bit code */ - [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, /* 16-bit code */ - [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, /* * The APM segments have byte granularity and their bases * are set at run time. All have 64k limits. */ /* 32-bit code */ - [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, /* 16-bit code */ - [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, /* data */ - [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, + [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, - [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, - [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, + [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, + [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, GDT_STACK_CANARY_INIT #endif } }; @@ -164,16 +164,17 @@ static inline int flag_is_changeable_p(u32 flag) * the CPUID. Add "volatile" to not allow gcc to * optimize the subsequent calls to this function. */ - asm volatile ("pushfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "movl %0,%1\n\t" - "xorl %2,%0\n\t" - "pushl %0\n\t" - "popfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "popfl\n\t" + asm volatile ("pushfl \n\t" + "pushfl \n\t" + "popl %0 \n\t" + "movl %0, %1 \n\t" + "xorl %2, %0 \n\t" + "pushl %0 \n\t" + "popfl \n\t" + "pushfl \n\t" + "popl %0 \n\t" + "popfl \n\t" + : "=&r" (f1), "=&r" (f2) : "ir" (flag)); @@ -188,18 +189,22 @@ static int __cpuinit have_cpuid_p(void) static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) { - if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { - /* Disable processor serial number */ - unsigned long lo, hi; - rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - lo |= 0x200000; - wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - printk(KERN_NOTICE "CPU serial number disabled.\n"); - clear_cpu_cap(c, X86_FEATURE_PN); - - /* Disabling the serial number may affect the cpuid level */ - c->cpuid_level = cpuid_eax(0); - } + unsigned long lo, hi; + + if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr) + return; + + /* Disable processor serial number: */ + + rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + lo |= 0x200000; + wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + + printk(KERN_NOTICE "CPU serial number disabled.\n"); + clear_cpu_cap(c, X86_FEATURE_PN); + + /* Disabling the serial number may affect the cpuid level */ + c->cpuid_level = cpuid_eax(0); } static int __init x86_serial_nr_setup(char *s) @@ -232,6 +237,7 @@ struct cpuid_dependent_feature { u32 feature; u32 level; }; + static const struct cpuid_dependent_feature __cpuinitconst cpuid_dependent_features[] = { { X86_FEATURE_MWAIT, 0x00000005 }, @@ -245,6 +251,9 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) const struct cpuid_dependent_feature *df; for (df = cpuid_dependent_features; df->feature; df++) { + + if (!cpu_has(c, df->feature)) + continue; /* * Note: cpuid_level is set to -1 if unavailable, but * extended_extended_level is set to 0 if unavailable @@ -252,26 +261,26 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) * when signed; hence the weird messing around with * signs here... */ - if (cpu_has(c, df->feature) && - ((s32)df->level < 0 ? + if (!((s32)df->level < 0 ? (u32)df->level > (u32)c->extended_cpuid_level : - (s32)df->level > (s32)c->cpuid_level)) { - clear_cpu_cap(c, df->feature); - if (warn) - printk(KERN_WARNING - "CPU: CPU feature %s disabled " - "due to lack of CPUID level 0x%x\n", - x86_cap_flags[df->feature], - df->level); - } + (s32)df->level > (s32)c->cpuid_level)) + continue; + + clear_cpu_cap(c, df->feature); + if (!warn) + continue; + + printk(KERN_WARNING + "CPU: CPU feature %s disabled, no CPUID level 0x%x\n", + x86_cap_flags[df->feature], df->level); } } /* * Naming convention should be: [()] * This table only is used unless init_() below doesn't set it; - * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used - * + * in particular, if CPUID levels 0x80000002..4 are supported, this + * isn't used */ /* Look up CPU names by table lookup. */ @@ -308,8 +317,10 @@ void load_percpu_segment(int cpu) load_stack_canary_segment(); } -/* Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. */ +/* + * Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one. + */ void switch_to_new_gdt(int cpu) { struct desc_ptr gdt_descr; @@ -355,14 +366,16 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c) if (c->extended_cpuid_level < 0x80000004) return; - v = (unsigned int *) c->x86_model_id; + v = (unsigned int *)c->x86_model_id; cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); c->x86_model_id[48] = 0; - /* Intel chips right-justify this string for some dumb reason; - undo that brain damage */ + /* + * Intel chips right-justify this string for some dumb reason; + * undo that brain damage: + */ p = q = &c->x86_model_id[0]; while (*p == ' ') p++; @@ -439,28 +452,30 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) if (smp_num_siblings == 1) { printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1) { + goto out; + } - if (smp_num_siblings > nr_cpu_ids) { - pr_warning("CPU: Unsupported number of siblings %d", - smp_num_siblings); - smp_num_siblings = 1; - return; - } + if (smp_num_siblings <= 1) + goto out; - index_msb = get_count_order(smp_num_siblings); - c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, - index_msb); + if (smp_num_siblings > nr_cpu_ids) { + pr_warning("CPU: Unsupported number of siblings %d", + smp_num_siblings); + smp_num_siblings = 1; + return; + } - smp_num_siblings = smp_num_siblings / c->x86_max_cores; + index_msb = get_count_order(smp_num_siblings); + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); - index_msb = get_count_order(smp_num_siblings); + smp_num_siblings = smp_num_siblings / c->x86_max_cores; - core_bits = get_count_order(c->x86_max_cores); + index_msb = get_count_order(smp_num_siblings); - c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) & - ((1 << core_bits) - 1); - } + core_bits = get_count_order(c->x86_max_cores); + + c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) & + ((1 << core_bits) - 1); out: if ((c->x86_max_cores * smp_num_siblings) > 1) { @@ -475,8 +490,8 @@ out: static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; - int i; static int printed; + int i; for (i = 0; i < X86_VENDOR_NUM; i++) { if (!cpu_devs[i]) @@ -485,6 +500,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) if (!strcmp(v, cpu_devs[i]->c_ident[0]) || (cpu_devs[i]->c_ident[1] && !strcmp(v, cpu_devs[i]->c_ident[1]))) { + this_cpu = cpu_devs[i]; c->x86_vendor = this_cpu->c_x86_vendor; return; @@ -493,8 +509,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) if (!printed) { printed++; - printk(KERN_ERR "CPU: vendor_id '%s'" - "unknown, using generic init.\n", v); + printk(KERN_ERR + "CPU: vendor_id '%s' unknown, using generic init.\n", v); + printk(KERN_ERR "CPU: Your system may be unstable.\n"); } @@ -514,14 +531,17 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c) /* Intel-defined flags: level 0x00000001 */ if (c->cpuid_level >= 0x00000001) { u32 junk, tfms, cap0, misc; + cpuid(0x00000001, &tfms, &misc, &junk, &cap0); c->x86 = (tfms >> 8) & 0xf; c->x86_model = (tfms >> 4) & 0xf; c->x86_mask = tfms & 0xf; + if (c->x86 == 0xf) c->x86 += (tfms >> 20) & 0xff; if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xf) << 4; + if (cap0 & (1<<19)) { c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; c->x86_cache_alignment = c->x86_clflush_size; @@ -537,6 +557,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) /* Intel-defined flags: level 0x00000001 */ if (c->cpuid_level >= 0x00000001) { u32 capability, excap; + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); c->x86_capability[0] = capability; c->x86_capability[4] = excap; @@ -545,6 +566,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; + if ((xlvl & 0xffff0000) == 0x80000000) { if (xlvl >= 0x80000001) { c->x86_capability[1] = cpuid_edx(0x80000001); @@ -762,8 +784,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) squash_the_stupid_serial_number(c); /* - * The vendor-specific functions might have changed features. Now - * we do "generic changes." + * The vendor-specific functions might have changed features. + * Now we do "generic changes." */ /* Filter out anything that depends on CPUID levels we don't have */ @@ -846,8 +868,8 @@ void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) } struct msr_range { - unsigned min; - unsigned max; + unsigned min; + unsigned max; }; static struct msr_range msr_range_array[] __cpuinitdata = { @@ -859,14 +881,15 @@ static struct msr_range msr_range_array[] __cpuinitdata = { static void __cpuinit print_cpu_msr(void) { + unsigned index_min, index_max; unsigned index; u64 val; int i; - unsigned index_min, index_max; for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { index_min = msr_range_array[i].min; index_max = msr_range_array[i].max; + for (index = index_min; index < index_max; index++) { if (rdmsrl_amd_safe(index, &val)) continue; @@ -876,6 +899,7 @@ static void __cpuinit print_cpu_msr(void) } static int show_msr __cpuinitdata; + static __init int setup_show_msr(char *arg) { int num; @@ -899,10 +923,12 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { char *vendor = NULL; - if (c->x86_vendor < X86_VENDOR_NUM) + if (c->x86_vendor < X86_VENDOR_NUM) { vendor = this_cpu->c_vendor; - else if (c->cpuid_level >= 0) - vendor = c->x86_vendor_id; + } else { + if (c->cpuid_level >= 0) + vendor = c->x86_vendor_id; + } if (vendor && !strstr(c->x86_model_id, vendor)) printk(KERN_CONT "%s ", vendor); @@ -929,10 +955,12 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) static __init int setup_disablecpuid(char *arg) { int bit; + if (get_option(&arg, &bit) && bit < NCAPINTS*32) setup_clear_cpu_cap(bit); else return 0; + return 1; } __setup("clearcpuid=", setup_disablecpuid); @@ -942,6 +970,7 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE); + DEFINE_PER_CPU(char *, irq_stack_ptr) = init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; @@ -951,6 +980,17 @@ EXPORT_PER_CPU_SYMBOL(kernel_stack); DEFINE_PER_CPU(unsigned int, irq_count) = -1; +/* + * Special IST stacks which the CPU switches to when it calls + * an IST-marked descriptor entry. Up to 7 stacks (hardware + * limit), all of them are 4K, except the debug stack which + * is 8K. + */ +static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, + [DEBUG_STACK - 1] = DEBUG_STKSZ +}; + static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]) __aligned(PAGE_SIZE); @@ -984,7 +1024,7 @@ unsigned long kernel_eflags; */ DEFINE_PER_CPU(struct orig_ist, orig_ist); -#else /* x86_64 */ +#else /* CONFIG_X86_64 */ #ifdef CONFIG_CC_STACKPROTECTOR DEFINE_PER_CPU(unsigned long, stack_canary); @@ -996,9 +1036,10 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) memset(regs, 0, sizeof(struct pt_regs)); regs->fs = __KERNEL_PERCPU; regs->gs = __KERNEL_STACK_CANARY; + return regs; } -#endif /* x86_64 */ +#endif /* CONFIG_X86_64 */ /* * Clear all 6 debug registers: @@ -1024,15 +1065,20 @@ static void clear_all_debug_regs(void) * A lot of state is already set up in PDA init for 64 bit */ #ifdef CONFIG_X86_64 + void __cpuinit cpu_init(void) { - int cpu = stack_smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); - struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); - unsigned long v; + struct orig_ist *orig_ist; struct task_struct *me; + struct tss_struct *t; + unsigned long v; + int cpu; int i; + cpu = stack_smp_processor_id(); + t = &per_cpu(init_tss, cpu); + orig_ist = &per_cpu(orig_ist, cpu); + #ifdef CONFIG_NUMA if (cpu != 0 && percpu_read(node_number) == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) @@ -1073,19 +1119,17 @@ void __cpuinit cpu_init(void) * set up and load the per-CPU TSS */ if (!orig_ist->ist[0]) { - static const unsigned int sizes[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ - }; char *estacks = per_cpu(exception_stacks, cpu); + for (v = 0; v < N_EXCEPTION_STACKS; v++) { - estacks += sizes[v]; + estacks += exception_stack_sizes[v]; orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; } } t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + /* * <= is required because the CPU will access up to * 8 bits beyond the end of the IO permission bitmap. @@ -1187,5 +1231,4 @@ void __cpuinit cpu_init(void) xsave_init(); } - #endif -- cgit v1.2.3 From 78a8b35bc7abf8b8333d6f625e08c0f7cc1c3742 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 12 Mar 2009 22:36:01 -0700 Subject: x86: make e820_update_range() handle small range update Impact: enhance e820 code to handle more cases Try to handle new range which could be covered by one entry. Signed-off-by: Yinghai Lu Cc: jbeulich@novell.com LKML-Reference: <49B9F0C1.10402@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 95b81c18b6b..0c34ff49ff4 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -427,6 +427,7 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, u64 size, unsigned old_type, unsigned new_type) { + u64 end; unsigned int i; u64 real_updated_size = 0; @@ -435,21 +436,35 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, if (size > (ULLONG_MAX - start)) size = ULLONG_MAX - start; + end = start + size; for (i = 0; i < e820x->nr_map; i++) { struct e820entry *ei = &e820x->map[i]; u64 final_start, final_end; + u64 ei_end; + if (ei->type != old_type) continue; - /* totally covered? */ - if (ei->addr >= start && - (ei->addr + ei->size) <= (start + size)) { + + ei_end = ei->addr + ei->size; + /* totally covered by new range? */ + if (ei->addr >= start && ei_end <= end) { ei->type = new_type; real_updated_size += ei->size; continue; } + + /* new range is totally covered? */ + if (ei->addr < start && ei_end > end) { + __e820_add_region(e820x, start, size, new_type); + __e820_add_region(e820x, end, ei_end - end, ei->type); + ei->size = start - ei->addr; + real_updated_size += size; + continue; + } + /* partially covered */ final_start = max(start, ei->addr); - final_end = min(start + size, ei->addr + ei->size); + final_end = min(end, ei_end); if (final_start >= final_end) continue; -- cgit v1.2.3 From 63516ef6d6a8379ee5c32463efc6a75f9da8a309 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 13 Mar 2009 12:46:07 -0700 Subject: x86: fix get_mtrr() warning about smp_processor_id() with CONFIG_PREEMPT=y Impact: fix debug warning Jaswinder noticed that there is a warning about smp_processor_id() in get_mtrr(). Fix it by wrapping the printout into a get/put_cpu() pair. Reported-by: Jaswinder Singh Rajput Signed-off-by: Yinghai Lu Cc: Andrew Morton LKML-Reference: <49BAB7FF.4030107@kernel.org> [ changed to get/put_cpu(), cleaned up surrounding code a it. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 950c434f793..641ee3507d0 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -381,11 +381,14 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, { unsigned int mask_lo, mask_hi, base_lo, base_hi; unsigned int tmp, hi; + int cpu; /* * get_mtrr doesn't need to update mtrr_state, also it could be called * from any cpu, so try to print it out directly. */ + cpu = get_cpu(); + rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); if ((mask_lo & 0x800) == 0) { @@ -393,15 +396,16 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, *base = 0; *size = 0; *type = 0; - return; + goto out_put_cpu; } rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); - /* Work out the shifted address mask. */ + /* Work out the shifted address mask: */ tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; mask_lo = size_or_mask | tmp; - /* Expand tmp with high bits to all 1s*/ + + /* Expand tmp with high bits to all 1s: */ hi = fls(tmp); if (hi > 0) { tmp |= ~((1<<(hi - 1)) - 1); @@ -412,15 +416,19 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, } } - /* This works correctly if size is a power of two, i.e. a - contiguous range. */ + /* + * This works correctly if size is a power of two, i.e. a + * contiguous range: + */ *size = -mask_lo; *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; *type = base_lo & 0xff; printk(KERN_DEBUG " get_mtrr: cpu%d reg%02d base=%010lx size=%010lx %s\n", - smp_processor_id(), reg, *base, *size, + cpu, reg, *base, *size, mtrr_attrib_to_str(*type & 0xff)); +out_put_cpu: + put_cpu(); } /** -- cgit v1.2.3 From d4c90e37a21154c1910b2646e9544bdd32d5bc3a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 13 Mar 2009 14:08:49 -0700 Subject: x86: print the continous part of fixed mtrrs together Impact: print out fewer lines 1. print continuous range with same type together 2. change _INFO to _DEBUG Signed-off-by: Yinghai Lu LKML-Reference: <49BACB61.8000302@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 58 +++++++++++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 641ee3507d0..37f28fc7cf9 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -208,13 +208,47 @@ void mtrr_save_fixed_ranges(void *info) get_fixed_ranges(mtrr_state.fixed_ranges); } -static void print_fixed(unsigned base, unsigned step, const mtrr_type*types) +static unsigned __initdata last_fixed_start; +static unsigned __initdata last_fixed_end; +static mtrr_type __initdata last_fixed_type; + +static void __init print_fixed_last(void) +{ + if (!last_fixed_end) + return; + + printk(KERN_DEBUG " %05X-%05X %s\n", last_fixed_start, + last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); + + last_fixed_end = 0; +} + +static void __init update_fixed_last(unsigned base, unsigned end, + mtrr_type type) +{ + last_fixed_start = base; + last_fixed_end = end; + last_fixed_type = type; +} + +static void __init print_fixed(unsigned base, unsigned step, + const mtrr_type *types) { unsigned i; - for (i = 0; i < 8; ++i, ++types, base += step) - printk(KERN_INFO " %05X-%05X %s\n", - base, base + step - 1, mtrr_attrib_to_str(*types)); + for (i = 0; i < 8; ++i, ++types, base += step) { + if (last_fixed_end == 0) { + update_fixed_last(base, base + step, *types); + continue; + } + if (last_fixed_end == base && last_fixed_type == *types) { + last_fixed_end = base + step; + continue; + } + /* new segments: gap or different type */ + print_fixed_last(); + update_fixed_last(base, base + step, *types); + } } static void prepare_set(void); @@ -225,22 +259,26 @@ static void __init print_mtrr_state(void) unsigned int i; int high_width; - printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type)); + printk(KERN_DEBUG "MTRR default type: %s\n", + mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { - printk(KERN_INFO "MTRR fixed ranges %sabled:\n", + printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n", mtrr_state.enabled & 1 ? "en" : "dis"); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); for (i = 0; i < 8; ++i) print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); + + /* tail */ + print_fixed_last(); } - printk(KERN_INFO "MTRR variable ranges %sabled:\n", + printk(KERN_DEBUG "MTRR variable ranges %sabled:\n", mtrr_state.enabled & 2 ? "en" : "dis"); high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) - printk(KERN_INFO " %u base %0*X%05X000 mask %0*X%05X000 %s\n", + printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n", i, high_width, mtrr_state.var_ranges[i].base_hi, @@ -250,10 +288,10 @@ static void __init print_mtrr_state(void) mtrr_state.var_ranges[i].mask_lo >> 12, mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); else - printk(KERN_INFO " %u disabled\n", i); + printk(KERN_DEBUG " %u disabled\n", i); } if (mtrr_tom2) { - printk(KERN_INFO "TOM2: %016llx aka %lldM\n", + printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); } } -- cgit v1.2.3 From 48f4c485c275e9550fa1a1191768689cc3ae0037 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sat, 14 Mar 2009 12:24:02 +0100 Subject: x86/centaur: merge 32 & 64 bit version there should be no difference, except: * the 64bit variant now also initializes the padlock unit. * ->c_early_init() is executed again from ->c_init() * the 64bit fixups made into 32bit path. Signed-off-by: Sebastian Andrzej Siewior Cc: herbert@gondor.apana.org.au LKML-Reference: <1237029843-28076-2-git-send-email-sebastian@breakpoint.cc> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 3 +-- arch/x86/kernel/cpu/centaur.c | 34 +++++++++++++++++++++++++++------- arch/x86/kernel/cpu/centaur_64.c | 37 ------------------------------------- 3 files changed, 28 insertions(+), 46 deletions(-) delete mode 100644 arch/x86/kernel/cpu/centaur_64.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index d4356f8b752..4e242f9a06e 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -19,8 +19,7 @@ obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o obj-$(CONFIG_CPU_SUP_INTEL) += intel.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o -obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o -obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o +obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 983e0830f0d..c95e831bb09 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,11 +1,11 @@ +#include #include #include -#include #include -#include #include #include +#include #include "cpu.h" @@ -276,7 +276,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c) */ c->x86_capability[5] = cpuid_edx(0xC0000001); } - +#ifdef CONFIG_X86_32 /* Cyrix III family needs CX8 & PGE explicitly enabled. */ if (c->x86_model >= 6 && c->x86_model <= 9) { rdmsr(MSR_VIA_FCR, lo, hi); @@ -288,6 +288,11 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c) /* Before Nehemiah, the C3's had 3dNOW! */ if (c->x86_model >= 6 && c->x86_model < 9) set_cpu_cap(c, X86_FEATURE_3DNOW); +#endif + if (c->x86 == 0x6 && c->x86_model >= 0xf) { + c->x86_cache_alignment = c->x86_clflush_size * 2; + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + } display_cacheinfo(c); } @@ -316,16 +321,25 @@ enum { static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) { switch (c->x86) { +#ifdef CONFIG_X86_32 case 5: /* Emulate MTRRs using Centaur's MCR. */ set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); break; +#endif + case 6: + if (c->x86_model >= 0xf) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + break; } +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSENTER32); +#endif } static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { - +#ifdef CONFIG_X86_32 char *name; u32 fcr_set = 0; u32 fcr_clr = 0; @@ -337,8 +351,10 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c) * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ clear_cpu_cap(c, 0*32+31); - +#endif + early_init_centaur(c); switch (c->x86) { +#ifdef CONFIG_X86_32 case 5: switch (c->x86_model) { case 4: @@ -442,16 +458,20 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c) } sprintf(c->x86_model_id, "WinChip %s", name); break; - +#endif case 6: init_c3(c); break; } +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); +#endif } static unsigned int __cpuinit centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) { +#ifdef CONFIG_X86_32 /* VIA C3 CPUs (670-68F) need further shifting. */ if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8))) size >>= 8; @@ -464,7 +484,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) if ((c->x86 == 6) && (c->x86_model == 9) && (c->x86_mask == 1) && (size == 65)) size -= 1; - +#endif return size; } diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c deleted file mode 100644 index 51b09c48c9c..00000000000 --- a/arch/x86/kernel/cpu/centaur_64.c +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include - -#include -#include - -#include "cpu.h" - -static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) -{ - if (c->x86 == 0x6 && c->x86_model >= 0xf) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - - set_cpu_cap(c, X86_FEATURE_SYSENTER32); -} - -static void __cpuinit init_centaur(struct cpuinfo_x86 *c) -{ - early_init_centaur(c); - - if (c->x86 == 0x6 && c->x86_model >= 0xf) { - c->x86_cache_alignment = c->x86_clflush_size * 2; - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - } - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); -} - -static const struct cpu_dev centaur_cpu_dev __cpuinitconst = { - .c_vendor = "Centaur", - .c_ident = { "CentaurHauls" }, - .c_early_init = early_init_centaur, - .c_init = init_centaur, - .c_x86_vendor = X86_VENDOR_CENTAUR, -}; - -cpu_dev_register(centaur_cpu_dev); - -- cgit v1.2.3 From f4c3c4cdb1de232ff37cf4339eb2f36c84e20da6 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 13 Mar 2009 19:59:26 +0530 Subject: x86: cpu_debug add support for various AMD CPUs Impact: Added AMD CPUs support Added flags for various AMD CPUs. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpu_debug.c | 150 ++++++++++++++++++++++++++++------------ 1 file changed, 106 insertions(+), 44 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 21c0cf8ced1..46e29ab96c6 100755 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -64,6 +64,8 @@ static struct cpu_debug_base cpu_base[] = { { "vmx", CPU_VMX, 0 }, { "call", CPU_CALL, 0 }, { "base", CPU_BASE, 0 }, + { "ver", CPU_VER, 0 }, + { "conf", CPU_CONF, 0 }, { "smm", CPU_SMM, 0 }, { "svm", CPU_SVM, 0 }, { "osvm", CPU_OSVM, 0 }, @@ -177,54 +179,59 @@ static struct cpu_debug_range cpu_intel_range[] = { /* AMD Registers Range */ static struct cpu_debug_range cpu_amd_range[] = { - { 0x00000010, 0x00000010, CPU_TIME, CPU_ALL, }, - { 0x0000001B, 0x0000001B, CPU_APIC, CPU_ALL, }, - { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_ALL, }, - - { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_ALL, }, - { 0x00000179, 0x0000017A, CPU_MC, CPU_ALL, }, - { 0x0000017B, 0x0000017B, CPU_MC, CPU_ALL, }, - { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_ALL, }, - { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_ALL, }, - - { 0x00000200, 0x0000020F, CPU_MTRR, CPU_ALL, }, - { 0x00000250, 0x00000250, CPU_MTRR, CPU_ALL, }, - { 0x00000258, 0x00000259, CPU_MTRR, CPU_ALL, }, - { 0x00000268, 0x0000026F, CPU_MTRR, CPU_ALL, }, - { 0x00000277, 0x00000277, CPU_PAT, CPU_ALL, }, - { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_ALL, }, - - { 0x00000400, 0x00000417, CPU_MC, CPU_ALL, }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_ALL, }, - { 0xC0000081, 0xC0000084, CPU_CALL, CPU_ALL, }, - { 0xC0000100, 0xC0000102, CPU_BASE, CPU_ALL, }, - { 0xC0000103, 0xC0000103, CPU_TIME, CPU_ALL, }, - - { 0xC0000408, 0xC000040A, CPU_MC, CPU_ALL, }, - - { 0xc0010000, 0xc0010007, CPU_PMC, CPU_ALL, }, - { 0xc0010010, 0xc0010010, CPU_MTRR, CPU_ALL, }, - { 0xc0010016, 0xc001001A, CPU_MTRR, CPU_ALL, }, - { 0xc001001D, 0xc001001D, CPU_MTRR, CPU_ALL, }, - { 0xc0010030, 0xc0010035, CPU_BIOS, CPU_ALL, }, - { 0xc0010056, 0xc0010056, CPU_SMM, CPU_ALL, }, - { 0xc0010061, 0xc0010063, CPU_SMM, CPU_ALL, }, - { 0xc0010074, 0xc0010074, CPU_MC, CPU_ALL, }, - { 0xc0010111, 0xc0010113, CPU_SMM, CPU_ALL, }, - { 0xc0010114, 0xc0010118, CPU_SVM, CPU_ALL, }, - { 0xc0010119, 0xc001011A, CPU_SMM, CPU_ALL, }, - { 0xc0010140, 0xc0010141, CPU_OSVM, CPU_ALL, }, - { 0xc0010156, 0xc0010156, CPU_SMM, CPU_ALL, }, + { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, }, + { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, }, + { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, }, + { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS }, + { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS }, + { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, }, + + { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, }, + { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, }, + { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, }, + { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, }, + + { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, }, + { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, }, + { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, }, + { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, }, + { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, }, + { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, }, + + { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, }, + + { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, }, + { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, }, + { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, }, + { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, }, + + { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, }, + { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, }, + { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, }, + { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, }, + { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, }, + { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, }, + { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, }, + { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, }, + { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, }, + { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, }, + { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, }, + { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, }, + { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, }, + { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, }, + { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, }, + { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, }, + { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, }, + { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, }, }; -static int get_cpu_modelflag(unsigned cpu) +/* Intel */ +static int get_intel_modelflag(unsigned model) { int flag; - switch (per_cpu(cpu_model, cpu)) { - /* Intel */ + switch (model) { case 0x0501: case 0x0502: case 0x0504: @@ -271,6 +278,59 @@ static int get_cpu_modelflag(unsigned cpu) return flag; } +/* AMD */ +static int get_amd_modelflag(unsigned model) +{ + int flag; + + switch (model >> 8) { + case 0x6: + flag = CPU_AMD_K6; + break; + case 0x7: + flag = CPU_AMD_K7; + break; + case 0x8: + flag = CPU_AMD_K8; + break; + case 0xf: + flag = CPU_AMD_0F; + break; + case 0x10: + flag = CPU_AMD_10; + break; + case 0x11: + flag = CPU_AMD_11; + break; + default: + flag = CPU_NONE; + break; + } + + return flag; +} + +static int get_cpu_modelflag(unsigned cpu) +{ + int flag; + + flag = per_cpu(cpu_model, cpu); + + switch (flag >> 16) { + case X86_VENDOR_INTEL: + flag = get_intel_modelflag(flag); + break; + case X86_VENDOR_AMD: + flag = get_amd_modelflag(flag & 0xffff); + break; + default: + flag = CPU_NONE; + break; + } + + return flag; +} + static int get_cpu_range_count(unsigned cpu) { int index; @@ -311,7 +371,8 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag) return 1; break; case X86_VENDOR_AMD: - if (cpu_amd_range[i].flag & flag) + if ((cpu_amd_range[i].model & modelflag) && + (cpu_amd_range[i].flag & flag)) return 1; break; } @@ -337,7 +398,8 @@ static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, } break; case X86_VENDOR_AMD: - if (cpu_amd_range[index].flag & flag) { + if ((cpu_amd_range[index].model & modelflag) && + (cpu_amd_range[index].flag & flag)) { *min = cpu_amd_range[index].min; *max = cpu_amd_range[index].max; } -- cgit v1.2.3 From b9719a4d9cfa5d46fa6a1eb3e3fc2238e7981e4d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 10 Mar 2009 11:19:18 -0700 Subject: x86: make section delimiter symbols part of their section Impact: cleanup Move the symbols delimiting a section part of the section (section relative) rather than absolute. This avoids any unexpected gaps between the section-start symbol and the first data in the section, which could be caused by implicit alignment of the section data. It also makes the general form of vmlinux_64.lds.S consistent with vmlinux_32.lds.S. Signed-off-by: Jeremy Fitzhardinge Cc: Yinghai Lu Cc: "Eric W. Biederman" Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmlinux_64.lds.S | 85 +++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 40 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 5bf54e40c6e..fe5d21ce724 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -29,8 +29,8 @@ SECTIONS { . = __START_KERNEL; phys_startup_64 = startup_64 - LOAD_OFFSET; - _text = .; /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { + _text = .; /* Text and read-only data */ /* First the code that has to be first for bootstrapping */ *(.text.head) _stext = .; @@ -61,13 +61,13 @@ SECTIONS .data : AT(ADDR(.data) - LOAD_OFFSET) { DATA_DATA CONSTRUCTORS + _edata = .; /* End of data section */ } :data - _edata = .; /* End of data section */ - . = ALIGN(PAGE_SIZE); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); *(.data.cacheline_aligned) } . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); @@ -125,29 +125,29 @@ SECTIONS #undef VVIRT_OFFSET #undef VVIRT - . = ALIGN(THREAD_SIZE); /* init_task */ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + . = ALIGN(THREAD_SIZE); /* init_task */ *(.data.init_task) }:data.init - . = ALIGN(PAGE_SIZE); .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); *(.data.page_aligned) } - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - __smp_alt_begin = .; - __smp_locks = .; .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + /* might get freed after init */ + . = ALIGN(PAGE_SIZE); + __smp_alt_begin = .; + __smp_locks = .; *(.smp_locks) + __smp_locks_end = .; + . = ALIGN(PAGE_SIZE); + __smp_alt_end = .; } - __smp_locks_end = .; - . = ALIGN(PAGE_SIZE); - __smp_alt_end = .; . = ALIGN(PAGE_SIZE); /* Init code and data */ - __init_begin = .; + __init_begin = .; /* paired with __init_end */ .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { _sinittext = .; INIT_TEXT @@ -159,40 +159,42 @@ SECTIONS __initdata_end = .; } - . = ALIGN(16); - __setup_start = .; - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } - __setup_end = .; - __initcall_start = .; + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + . = ALIGN(16); + __setup_start = .; + *(.init.setup) + __setup_end = .; + } .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; INITCALLS + __initcall_end = .; } - __initcall_end = .; - __con_initcall_start = .; .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; *(.con_initcall.init) + __con_initcall_end = .; } - __con_initcall_end = .; - __x86_cpu_dev_start = .; .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; } - __x86_cpu_dev_end = .; SECURITY_INIT . = ALIGN(8); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; + __parainstructions = .; *(.parainstructions) - __parainstructions_end = .; + __parainstructions_end = .; } - . = ALIGN(8); - __alt_instructions = .; .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + . = ALIGN(8); + __alt_instructions = .; *(.altinstructions) + __alt_instructions_end = .; } - __alt_instructions_end = .; .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { *(.altinstr_replacement) } @@ -207,9 +209,11 @@ SECTIONS #ifdef CONFIG_BLK_DEV_INITRD . = ALIGN(PAGE_SIZE); - __initramfs_start = .; - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } - __initramfs_end = .; + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } #endif #ifdef CONFIG_SMP @@ -229,20 +233,21 @@ SECTIONS . = ALIGN(PAGE_SIZE); __init_end = .; - . = ALIGN(PAGE_SIZE); - __nosave_begin = .; .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */ - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - __bss_start = .; /* BSS */ .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __bss_start = .; /* BSS */ *(.bss.page_aligned) *(.bss) - } - __bss_stop = .; + __bss_stop = .; + } _end = . ; -- cgit v1.2.3 From 93dbda7cbcd70a0bd1a99f39f44a9ccde8ab9040 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Feb 2009 17:35:44 -0800 Subject: x86: add brk allocation for very, very early allocations Impact: new interface Add a brk()-like allocator which effectively extends the bss in order to allow very early code to do dynamic allocations. This is better than using statically allocated arrays for data in subsystems which may never get used. The space for brk allocations is in the bss ELF segment, so that the space is mapped properly by the code which maps the kernel, and so that bootloaders keep the space free rather than putting a ramdisk or something into it. The bss itself, delimited by __bss_stop, ends before the brk area (__brk_base to __brk_limit). The kernel text, data and bss is reserved up to __bss_stop. Any brk-allocated data is reserved separately just before the kernel pagetable is built, as that code allocates from unreserved spaces in the e820 map, potentially allocating from any unused brk memory. Ultimately any unused memory in the brk area is used in the general kernel memory pool. Initially the brk space is set to 1MB, which is probably much larger than any user needs (the largest current user is i386 head_32.S's code to build the pagetables to map the kernel, which can get fairly large with a big kernel image and no PSE support). So long as the system has sufficient memory for the bootloader to reserve the kernel+1MB brk, there are no bad effects resulting from an over-large brk. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head32.c | 2 +- arch/x86/kernel/head64.c | 2 +- arch/x86/kernel/setup.c | 39 ++++++++++++++++++++++++++++++++++----- arch/x86/kernel/vmlinux_32.lds.S | 7 +++++++ arch/x86/kernel/vmlinux_64.lds.S | 5 +++++ 5 files changed, 48 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index ac108d1fe18..29f1095b084 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -18,7 +18,7 @@ void __init i386_start_kernel(void) { reserve_trampoline_memory(); - reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); + reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index f5b27224769..70eaa852c73 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -100,7 +100,7 @@ void __init x86_64_start_reservations(char *real_mode_data) reserve_trampoline_memory(); - reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); + reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f28c56e6bf9..e6b742d2a3f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -114,6 +114,9 @@ unsigned int boot_cpu_id __read_mostly; +static __initdata unsigned long _brk_start = (unsigned long)__brk_base; +unsigned long _brk_end = (unsigned long)__brk_base; + #ifdef CONFIG_X86_64 int default_cpu_present_to_apicid(int mps_cpu) { @@ -337,6 +340,34 @@ static void __init relocate_initrd(void) } #endif +void * __init extend_brk(size_t size, size_t align) +{ + size_t mask = align - 1; + void *ret; + + BUG_ON(_brk_start == 0); + BUG_ON(align & mask); + + _brk_end = (_brk_end + mask) & ~mask; + BUG_ON((char *)(_brk_end + size) > __brk_limit); + + ret = (void *)_brk_end; + _brk_end += size; + + memset(ret, 0, size); + + return ret; +} + +static void __init reserve_brk(void) +{ + if (_brk_end > _brk_start) + reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK"); + + /* Mark brk area as locked down and no longer taking any new allocations */ + _brk_start = 0; +} + static void __init reserve_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; @@ -717,11 +748,7 @@ void __init setup_arch(char **cmdline_p) init_mm.start_code = (unsigned long) _text; init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; -#ifdef CONFIG_X86_32 - init_mm.brk = init_pg_tables_end + PAGE_OFFSET; -#else - init_mm.brk = (unsigned long) &_end; -#endif + init_mm.brk = _brk_end; code_resource.start = virt_to_phys(_text); code_resource.end = virt_to_phys(_etext)-1; @@ -842,6 +869,8 @@ void __init setup_arch(char **cmdline_p) setup_bios_corruption_check(); #endif + reserve_brk(); + /* max_pfn_mapped is updated here */ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< Date: Sat, 14 Mar 2009 17:19:51 -0700 Subject: x86: move brk initialization out of #ifdef CONFIG_BLK_DEV_INITRD Impact: build fix The brk initialization functions were incorrectly located inside an #ifdef CONFIG_VLK_DEV_INITRD block, causing the obvious build failure in minimal configurations. Signed-off-by: H. Peter Anvin Cc: Jeremy Fitzhardinge --- arch/x86/kernel/setup.c | 57 +++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e6b742d2a3f..b8329029c15 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -272,6 +272,35 @@ static inline void copy_edd(void) } #endif +void * __init extend_brk(size_t size, size_t align) +{ + size_t mask = align - 1; + void *ret; + + BUG_ON(_brk_start == 0); + BUG_ON(align & mask); + + _brk_end = (_brk_end + mask) & ~mask; + BUG_ON((char *)(_brk_end + size) > __brk_limit); + + ret = (void *)_brk_end; + _brk_end += size; + + memset(ret, 0, size); + + return ret; +} + +static void __init reserve_brk(void) +{ + if (_brk_end > _brk_start) + reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK"); + + /* Mark brk area as locked down and no longer taking any + new allocations */ + _brk_start = 0; +} + #ifdef CONFIG_BLK_DEV_INITRD #ifdef CONFIG_X86_32 @@ -340,34 +369,6 @@ static void __init relocate_initrd(void) } #endif -void * __init extend_brk(size_t size, size_t align) -{ - size_t mask = align - 1; - void *ret; - - BUG_ON(_brk_start == 0); - BUG_ON(align & mask); - - _brk_end = (_brk_end + mask) & ~mask; - BUG_ON((char *)(_brk_end + size) > __brk_limit); - - ret = (void *)_brk_end; - _brk_end += size; - - memset(ret, 0, size); - - return ret; -} - -static void __init reserve_brk(void) -{ - if (_brk_end > _brk_start) - reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK"); - - /* Mark brk area as locked down and no longer taking any new allocations */ - _brk_start = 0; -} - static void __init reserve_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; -- cgit v1.2.3 From ccf3fe02e35f4abca2589f99022cc25084bbd8ae Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Feb 2009 13:27:38 -0800 Subject: x86-32: use brk segment for allocating initial kernel pagetable Impact: use new interface instead of previous ad hoc implementation Rather than having special purpose init_pg_table_start/end variables to delimit the kernel pagetable built by head_32.S, just use the brk mechanism to extend the bss for the new pagetable. This patch removes init_pg_table_start/end and pg0, defines __brk_base (which is page-aligned and immediately follows _end), initializes the brk region to start there, and uses it for the 32-bit pagetable. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head32.c | 3 --- arch/x86/kernel/head_32.S | 14 +++++++------- arch/x86/kernel/setup.c | 6 ------ arch/x86/kernel/vmlinux_32.lds.S | 4 ---- 4 files changed, 7 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 29f1095b084..3f8579f8d42 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,9 +29,6 @@ void __init i386_start_kernel(void) reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif - reserve_early(init_pg_tables_start, init_pg_tables_end, - "INIT_PG_TABLE"); - reserve_ebda_region(); /* diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index c32ca19d591..db6652710e9 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -167,7 +167,7 @@ num_subarch_entries = (. - subarch_entries) / 4 /* * Initialize page tables. This creates a PDE and a set of page * tables, which are located immediately beyond _end. The variable - * init_pg_tables_end is set up to point to the first "safe" location. + * _brk_end is set up to point to the first "safe" location. * Mappings are created both at virtual address 0 (identity mapping) * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END. * @@ -190,8 +190,7 @@ default_entry: xorl %ebx,%ebx /* %ebx is kept at zero */ - movl $pa(pg0), %edi - movl %edi, pa(init_pg_tables_start) + movl $pa(__brk_base), %edi movl $pa(swapper_pg_pmd), %edx movl $PTE_IDENT_ATTR, %eax 10: @@ -216,7 +215,8 @@ default_entry: cmpl %ebp,%eax jb 10b 1: - movl %edi,pa(init_pg_tables_end) + addl $__PAGE_OFFSET, %edi + movl %edi, pa(_brk_end) shrl $12, %eax movl %eax, pa(max_pfn_mapped) @@ -227,8 +227,7 @@ default_entry: page_pde_offset = (__PAGE_OFFSET >> 20); - movl $pa(pg0), %edi - movl %edi, pa(init_pg_tables_start) + movl $pa(__brk_base), %edi movl $pa(swapper_pg_dir), %edx movl $PTE_IDENT_ATTR, %eax 10: @@ -249,7 +248,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20); leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp cmpl %ebp,%eax jb 10b - movl %edi,pa(init_pg_tables_end) + addl $__PAGE_OFFSET, %edi + movl %edi, pa(_brk_end) shrl $12, %eax movl %eax, pa(max_pfn_mapped) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b8329029c15..3ac2aa7b9ea 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -161,12 +161,6 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 -/* This value is set up by the early boot code to point to the value - immediately after the boot time page tables. It contains a *physical* - address, and must not be in the .bss segment! */ -unsigned long init_pg_tables_start __initdata = ~0UL; -unsigned long init_pg_tables_end __initdata = ~0UL; - static struct resource video_ram_resource = { .name = "Video RAM area", .start = 0xa0000, diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 27e44aa2158..1063fbe93c7 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -196,10 +196,6 @@ SECTIONS __brk_limit = . ; _end = . ; - - /* This is where the kernel creates the early boot page tables */ - . = ALIGN(PAGE_SIZE); - pg0 = . ; } /* Sections to be discarded */ -- cgit v1.2.3 From 6de6cb442e76bbaf2e685150be8ddac0f237a59c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Feb 2009 13:35:45 -0800 Subject: x86: use brk allocation for DMI Impact: use new interface instead of previous ad hoc implementation Use extend_brk() to allocate memory for DMI rather than having an ad-hoc allocator. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3ac2aa7b9ea..e894f36335f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -215,12 +215,6 @@ unsigned long mmu_cr4_features = X86_CR4_PAE; /* Boot loader ID as an integer, for the benefit of proc_dointvec */ int bootloader_type; -/* - * Early DMI memory - */ -int dmi_alloc_index; -char dmi_alloc_data[DMI_MAX_DATA]; - /* * Setup options */ -- cgit v1.2.3 From 7543c1de84ed93c6769c9f20dced08a522af8912 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 12 Mar 2009 16:04:42 -0700 Subject: x86-32: compute initial mapping size more accurately Impact: simplification We only need to map the kernel in head_32.S, not the whole of lowmem. We use 512MB as a reasonable (but arbitrary) limit on the maximum size of the kernel image. Signed-off-by: Yinghai Lu Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index db6652710e9..3ce5456dfbe 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -54,7 +54,7 @@ * * This should be a multiple of a page. */ -LOW_PAGES = 1<<(32-PAGE_SHIFT_asm) +LOW_PAGES = (KERNEL_IMAGE_SIZE + PAGE_SIZE_asm - 1)>>PAGE_SHIFT /* * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate -- cgit v1.2.3 From 796216a57fe45c04adc35bda1f0782efec78a713 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 12 Mar 2009 16:09:49 -0700 Subject: x86: allow extend_brk users to reserve brk space Impact: new interface; remove hard-coded limit Add RESERVE_BRK(name, size) macro to reserve space in the brk area. This should be a conservative (ie, larger) estimate of how much space might possibly be required from the brk area. Any unused space will be freed, so there's no real downside on making the reservation too large (within limits). The name should be unique within a given file, and somewhat descriptive. The C definition of RESERVE_BRK() ends up being more complex than one would expect to work around a cluster of gcc infelicities: The first attempt was to simply try putting __section(.brk_reservation) on a variable. This doesn't work because it ends up making it a @progbits section, which gets actual space allocated in the vmlinux executable. The second attempt was to emit the space into a section using asm, but gcc doesn't allow arguments to be passed to file-level asm() statements, making it hard to pass in the size. The final attempt is to wrap the asm() in a function to allow it to have arguments, and put the function itself into the .discard section, which vmlinux*.lds drops entirely from the emitted vmlinux. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_32.S | 2 ++ arch/x86/kernel/setup.c | 2 ++ arch/x86/kernel/vmlinux_32.lds.S | 4 +++- arch/x86/kernel/vmlinux_64.lds.S | 4 +++- 4 files changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 3ce5456dfbe..9e89f2a14b9 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -75,6 +75,8 @@ ALLOCATOR_SLOP = 4 INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm +RESERVE_BRK(pagetables, PAGE_TABLE_SIZE * PAGE_SIZE) + /* * 32-bit kernel entrypoint; only used by the boot CPU. On entry, * %esi points to the real-mode code as a 32-bit pointer. diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e894f36335f..a0d26237d7c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -112,6 +112,8 @@ #define ARCH_SETUP #endif +RESERVE_BRK(dmi_alloc, 65536); + unsigned int boot_cpu_id __read_mostly; static __initdata unsigned long _brk_start = (unsigned long)__brk_base; diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 1063fbe93c7..a1f28b85fb3 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -192,7 +192,8 @@ SECTIONS . = ALIGN(PAGE_SIZE); __brk_base = . ; - . += 1024 * 1024 ; + . += 64 * 1024 ; /* 64k slop space */ + *(.brk_reservation) /* areas brk users have reserved */ __brk_limit = . ; _end = . ; @@ -201,6 +202,7 @@ SECTIONS /* Sections to be discarded */ /DISCARD/ : { *(.exitcall.exit) + *(.discard) } STABS_DEBUG diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index ff373423138..7996687663a 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -250,7 +250,8 @@ SECTIONS . = ALIGN(PAGE_SIZE); __brk_base = . ; - . += 1024 * 1024 ; + . += 64 * 1024; /* 64k slop space */ + *(.brk_reservation) /* areas brk users have reserved */ __brk_limit = . ; } @@ -260,6 +261,7 @@ SECTIONS /DISCARD/ : { *(.exitcall.exit) *(.eh_frame) + *(.discard) } STABS_DEBUG -- cgit v1.2.3 From 2bd2753ff46346543ab92e80df9d96366e21baa5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 9 Mar 2009 01:15:57 -0700 Subject: x86: put initial_pg_tables into .bss Impact: makes vmlinux section information more useful Don't use ram after _end blindly for pagetables. aka init pages is before _end put those pg table into .bss [Adapted to use brk segment - Jeremy] v2: keep initial page table up to 512M only. v4: put initial page tables just before _end Signed-off-by: Yinghai Lu Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_32.S | 43 +++++++++++++--------------------------- arch/x86/kernel/vmlinux_32.lds.S | 6 ++++++ 2 files changed, 20 insertions(+), 29 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 9e89f2a14b9..c79741cfb07 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -41,41 +41,28 @@ * This is how much memory *in addition to the memory covered up to * and including _end* we need mapped initially. * We need: - * - one bit for each possible page, but only in low memory, which means - * 2^32/4096/8 = 128K worst case (4G/4G split.) - * - enough space to map all low memory, which means - * (2^32/4096) / 1024 pages (worst case, non PAE) - * (2^32/4096) / 512 + 4 pages (worst case for PAE) - * - a few pages for allocator use before the kernel pagetable has - * been set up + * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE) + * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE) * * Modulo rounding, each megabyte assigned here requires a kilobyte of * memory, which is currently unreclaimed. * * This should be a multiple of a page. + * + * KERNEL_IMAGE_SIZE should be greater than pa(_end) + * and small than max_low_pfn, otherwise will waste some page table entries */ LOW_PAGES = (KERNEL_IMAGE_SIZE + PAGE_SIZE_asm - 1)>>PAGE_SHIFT -/* - * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate - * pagetables from above the 16MB DMA limit, so we'll have to set - * up pagetables 16MB more (worst-case): - */ -#ifdef CONFIG_DEBUG_PAGEALLOC -LOW_PAGES = LOW_PAGES + 0x1000000 -#endif - #if PTRS_PER_PMD > 1 PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD #else PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD) #endif -BOOTBITMAP_SIZE = LOW_PAGES / 8 ALLOCATOR_SLOP = 4 -INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm - -RESERVE_BRK(pagetables, PAGE_TABLE_SIZE * PAGE_SIZE) +INIT_MAP_SIZE = (PAGE_TABLE_SIZE + ALLOCATOR_SLOP) * PAGE_SIZE_asm +RESERVE_BRK(pagetables, INIT_MAP_SIZE) /* * 32-bit kernel entrypoint; only used by the boot CPU. On entry, @@ -168,10 +155,10 @@ num_subarch_entries = (. - subarch_entries) / 4 /* * Initialize page tables. This creates a PDE and a set of page - * tables, which are located immediately beyond _end. The variable + * tables, which are located immediately beyond __brk_base. The variable * _brk_end is set up to point to the first "safe" location. * Mappings are created both at virtual address 0 (identity mapping) - * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END. + * and PAGE_OFFSET for up to _end. * * Note that the stack is not yet set up! */ @@ -210,10 +197,9 @@ default_entry: loop 11b /* - * End condition: we must map up to and including INIT_MAP_BEYOND_END - * bytes beyond the end of our own page tables. + * End condition: we must map up to the end. */ - leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp + movl $pa(_end) + PTE_IDENT_ATTR, %ebp cmpl %ebp,%eax jb 10b 1: @@ -243,11 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20); addl $0x1000,%eax loop 11b /* - * End condition: we must map up to and including INIT_MAP_BEYOND_END - * bytes beyond the end of our own page tables; the +0x007 is - * the attribute bits + * End condition: we must map up to end */ - leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp + movl $pa(_end) + PTE_IDENT_ATTR, %ebp cmpl %ebp,%eax jb 10b addl $__PAGE_OFFSET, %edi @@ -638,6 +622,7 @@ swapper_pg_fixmap: .fill 1024,4,0 ENTRY(empty_zero_page) .fill 4096,1,0 + /* * This starts the data section. */ diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index a1f28b85fb3..98424f33e07 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -210,6 +210,12 @@ SECTIONS DWARF_DEBUG } +/* + * Build-time check on the image size: + */ +ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE") + #ifdef CONFIG_KEXEC /* Link time checks */ #include -- cgit v1.2.3 From 6d7942dc2a70a7e74c352107b150265602671588 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 14 Mar 2009 14:32:41 -0700 Subject: x86: fix 64k corruption-check Impact: fix boot crash Need to exit early if the addr is far above 64k. The crash got exposed by: 78a8b35: x86: make e820_update_range() handle small range update Signed-off-by: Yinghai Lu Cc: LKML-Reference: <49BC2279.2030101@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/check.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index b617b1164f1..fc999e6fc46 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -86,12 +86,12 @@ void __init setup_bios_corruption_check(void) if (!(addr + 1)) break; + if (addr >= corruption_check_size) + break; + if ((addr + size) > corruption_check_size) size = corruption_check_size - addr; - if (size == 0) - break; - e820_update_range(addr, size, E820_RAM, E820_RESERVED); scan_areas[num_scan_areas].addr = addr; scan_areas[num_scan_areas].size = size; -- cgit v1.2.3 From c61cf4cfe7c73c7aa62dde3ff82cd475b9c41481 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 15 Mar 2009 00:59:19 -0700 Subject: x86: print out more info in e820_update_range() Impact: help debug e820 bugs Try to print out more info, to catch wrong call parameters. Signed-off-by: Yinghai Lu LKML-Reference: <49BCB557.3030000@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 56 +++++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 0c34ff49ff4..fb638d9ce6d 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -131,6 +131,31 @@ void __init e820_add_region(u64 start, u64 size, int type) __e820_add_region(&e820, start, size, type); } +static void __init e820_print_type(u32 type) +{ + switch (type) { + case E820_RAM: + case E820_RESERVED_KERN: + printk(KERN_CONT "(usable)"); + break; + case E820_RESERVED: + printk(KERN_CONT "(reserved)"); + break; + case E820_ACPI: + printk(KERN_CONT "(ACPI data)"); + break; + case E820_NVS: + printk(KERN_CONT "(ACPI NVS)"); + break; + case E820_UNUSABLE: + printk(KERN_CONT "(unusable)"); + break; + default: + printk(KERN_CONT "type %u", type); + break; + } +} + void __init e820_print_map(char *who) { int i; @@ -140,27 +165,8 @@ void __init e820_print_map(char *who) (unsigned long long) e820.map[i].addr, (unsigned long long) (e820.map[i].addr + e820.map[i].size)); - switch (e820.map[i].type) { - case E820_RAM: - case E820_RESERVED_KERN: - printk(KERN_CONT "(usable)\n"); - break; - case E820_RESERVED: - printk(KERN_CONT "(reserved)\n"); - break; - case E820_ACPI: - printk(KERN_CONT "(ACPI data)\n"); - break; - case E820_NVS: - printk(KERN_CONT "(ACPI NVS)\n"); - break; - case E820_UNUSABLE: - printk("(unusable)\n"); - break; - default: - printk(KERN_CONT "type %u\n", e820.map[i].type); - break; - } + e820_print_type(e820.map[i].type); + printk(KERN_CONT "\n"); } } @@ -437,6 +443,14 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, size = ULLONG_MAX - start; end = start + size; + printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ", + (unsigned long long) start, + (unsigned long long) end); + e820_print_type(old_type); + printk(KERN_CONT " ==> "); + e820_print_type(new_type); + printk(KERN_CONT "\n"); + for (i = 0; i < e820x->nr_map; i++) { struct e820entry *ei = &e820x->map[i]; u64 final_start, final_end; -- cgit v1.2.3 From 514ec49a5f5146a6c0ade1a688787acf13617868 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 16 Mar 2009 17:07:33 +0900 Subject: x86, mce: remove incorrect __cpuinit for intel_init_cmci() Impact: Bug fix on UP Referring commit cc3ca22063784076bd240fda87217387a8f2ae92, Peter removed __cpuinit annotations for mce_cpu_features() and its successor functions, which caused troubles on UP configurations. However the intel_init_cmci() was introduced after that and it also has __cpuinit annotation even though it is called from mce_cpu_features(). Remove the annotation from that function too. Signed-off-by: Hidetoshi Seto Cc: H. Peter Anvin Cc: Andi Kleen Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index aaa7d973093..57df3d38347 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -270,7 +270,7 @@ void cmci_reenable(void) cmci_discover(banks, 0); } -static __cpuinit void intel_init_cmci(void) +static void intel_init_cmci(void) { int banks; -- cgit v1.2.3 From 250981e6e1ef04947eccaa55e8c25ddcaa47a02e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Mar 2009 13:07:21 +0100 Subject: x86: reduce preemption off section in exit thread Impact: latency improvement No need to keep preemption disabled over the kfree call. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 6afa5232dbb..156f87582c6 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -65,11 +65,11 @@ void exit_thread(void) { struct task_struct *me = current; struct thread_struct *t = &me->thread; + unsigned long *bp = t->io_bitmap_ptr; - if (me->thread.io_bitmap_ptr) { + if (bp) { struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); - kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; clear_thread_flag(TIF_IO_BITMAP); /* @@ -78,6 +78,7 @@ void exit_thread(void) memset(tss->io_bitmap, 0xff, t->io_bitmap_max); t->io_bitmap_max = 0; put_cpu(); + kfree(bp); } ds_exit_thread(current); -- cgit v1.2.3 From f0348c438c9ea156194d31fadde4a9e75522196f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 16 Mar 2009 16:33:59 -0700 Subject: x86: MTRR workaround for system with stange var MTRRs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: don't trim e820 according to wrong mtrr Ozan reports that his server emits strange warning. it turns out the BIOS sets the MTRRs incorrectly. Ignore those strange ranges, and don't trim e820, just emit one warning about BIOS Reported-by: Ozan Çağlayan Signed-off-by: Yinghai Lu LKML-Reference: <49BEE1E7.7020706@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cleanup.c | 11 +++++++++++ arch/x86/kernel/cpu/mtrr/main.c | 16 ++++++++-------- arch/x86/kernel/cpu/mtrr/mtrr.h | 1 + 3 files changed, 20 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 58b58bbf7eb..f4f89fbc228 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -189,6 +189,17 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, if (!size) continue; base = range_state[i].base_pfn; + if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && + (mtrr_state.enabled & 1)) { + /* Var MTRR contains UC entry below 1M? Skip it: */ + printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d " + "contains strange UC entry under 1M, check " + "with your system vendor!\n", i); + if (base + size <= (1<<(20-PAGE_SHIFT))) + continue; + size -= (1<<(20-PAGE_SHIFT)) - base; + base = 1<<(20-PAGE_SHIFT); + } subtract_range(range, base, base + size - 1); } if (extra_remove_size) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 5c2e266f41d..03cda01f57c 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -574,7 +574,7 @@ struct mtrr_value { unsigned long lsize; }; -static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES]; +static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; static int mtrr_save(struct sys_device * sysdev, pm_message_t state) { @@ -582,9 +582,9 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state) for (i = 0; i < num_var_ranges; i++) { mtrr_if->get(i, - &mtrr_state[i].lbase, - &mtrr_state[i].lsize, - &mtrr_state[i].ltype); + &mtrr_value[i].lbase, + &mtrr_value[i].lsize, + &mtrr_value[i].ltype); } return 0; } @@ -594,11 +594,11 @@ static int mtrr_restore(struct sys_device * sysdev) int i; for (i = 0; i < num_var_ranges; i++) { - if (mtrr_state[i].lsize) + if (mtrr_value[i].lsize) set_mtrr(i, - mtrr_state[i].lbase, - mtrr_state[i].lsize, - mtrr_state[i].ltype); + mtrr_value[i].lbase, + mtrr_value[i].lsize, + mtrr_value[i].ltype); } return 0; } diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 6710e93021a..77f67f7b347 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -79,6 +79,7 @@ extern struct mtrr_ops * mtrr_if; extern unsigned int num_var_ranges; extern u64 mtrr_tom2; +extern struct mtrr_state_type mtrr_state; void mtrr_state_warn(void); const char *mtrr_attrib_to_str(int x); -- cgit v1.2.3 From c090f532db3ab5b7be503f8ac84b0d33a18646c6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Mar 2009 12:07:54 -0700 Subject: x86-32: make sure we map enough to fit linear map pagetables Impact: crash fix head_32.S needs to map the kernel itself, and enough space so that mm/init.c can allocate space from the e820 allocator for the linear map of low memory. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_32.S | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index c79741cfb07..d383a7c0e49 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -38,8 +38,8 @@ #define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id /* - * This is how much memory *in addition to the memory covered up to - * and including _end* we need mapped initially. + * This is how much memory in addition to the memory covered up to + * and including _end we need mapped initially. * We need: * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE) * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE) @@ -52,16 +52,25 @@ * KERNEL_IMAGE_SIZE should be greater than pa(_end) * and small than max_low_pfn, otherwise will waste some page table entries */ -LOW_PAGES = (KERNEL_IMAGE_SIZE + PAGE_SIZE_asm - 1)>>PAGE_SHIFT #if PTRS_PER_PMD > 1 -PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD +#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) #else -PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD) +#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) #endif ALLOCATOR_SLOP = 4 -INIT_MAP_SIZE = (PAGE_TABLE_SIZE + ALLOCATOR_SLOP) * PAGE_SIZE_asm +/* Enough space to fit pagetables for the low memory linear map */ +MAPPING_BEYOND_END = (PAGE_TABLE_SIZE(1 << (32 - PAGE_SHIFT)) * PAGE_SIZE) + +/* + * Worst-case size of the kernel mapping we need to make: + * the worst-case size of the kernel itself, plus the extra we need + * to map for the linear map. + */ +KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT + +INIT_MAP_SIZE = (PAGE_TABLE_SIZE(KERNEL_PAGES) + ALLOCATOR_SLOP) * PAGE_SIZE_asm RESERVE_BRK(pagetables, INIT_MAP_SIZE) /* @@ -197,9 +206,9 @@ default_entry: loop 11b /* - * End condition: we must map up to the end. + * End condition: we must map up to the end + MAPPING_BEYOND_END. */ - movl $pa(_end) + PTE_IDENT_ATTR, %ebp + movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp cmpl %ebp,%eax jb 10b 1: @@ -229,9 +238,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20); addl $0x1000,%eax loop 11b /* - * End condition: we must map up to end + * End condition: we must map up to the end + MAPPING_BEYOND_END. */ - movl $pa(_end) + PTE_IDENT_ATTR, %ebp + movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp cmpl %ebp,%eax jb 10b addl $__PAGE_OFFSET, %edi -- cgit v1.2.3 From b8a22a6273d5aed248db2695ce9bf65eb3e9fbe6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Mar 2009 12:10:07 -0700 Subject: x86-32: remove ALLOCATOR_SLOP from head_32.S Impact: cleanup ALLOCATOR_SLOP is a vestigial remain from when we used the bootmem allocator to allocate the kernel's linear memory mapping. Now we directly reserve pages from the e820 mapping, and no longer require secondary structures to keep track of allocated pages. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_32.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index d383a7c0e49..fc884b90b6d 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -58,7 +58,6 @@ #else #define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) #endif -ALLOCATOR_SLOP = 4 /* Enough space to fit pagetables for the low memory linear map */ MAPPING_BEYOND_END = (PAGE_TABLE_SIZE(1 << (32 - PAGE_SHIFT)) * PAGE_SIZE) @@ -70,7 +69,7 @@ MAPPING_BEYOND_END = (PAGE_TABLE_SIZE(1 << (32 - PAGE_SHIFT)) * PAGE_SIZE) */ KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT -INIT_MAP_SIZE = (PAGE_TABLE_SIZE(KERNEL_PAGES) + ALLOCATOR_SLOP) * PAGE_SIZE_asm +INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm RESERVE_BRK(pagetables, INIT_MAP_SIZE) /* -- cgit v1.2.3 From 60ac98213914cc615c67e84bfb964aa95a080d13 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 17 Mar 2009 11:38:23 -0700 Subject: x86-32: tighten the bound on additional memory to map Impact: Tighten bound to avoid masking errors The definition of MAPPING_BEYOND_END was excessive; this has a nasty tendency to mask bugs. We have learned over time that this kind of bug hiding can cause some very strange errors. Therefore, tighten the bound to only need to map the actual kernel area. Signed-off-by: H. Peter Anvin Cc: Jeremy Fitzhardinge Cc: Yinghai Lu --- arch/x86/kernel/head_32.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index fc884b90b6d..30683883e0c 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -60,7 +60,8 @@ #endif /* Enough space to fit pagetables for the low memory linear map */ -MAPPING_BEYOND_END = (PAGE_TABLE_SIZE(1 << (32 - PAGE_SHIFT)) * PAGE_SIZE) +MAPPING_BEYOND_END = \ + PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT /* * Worst-case size of the kernel mapping we need to make: -- cgit v1.2.3 From 704439ddf9f8ff1fc8c6d8abaac4bc4c95697e39 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 14 Mar 2009 23:20:47 -0700 Subject: x86/brk: put the brk reservations in their own section Impact: disambiguate real .bss variables from .brk storage Add a .brk section after the .bss section. This has no effect on the final vmlinux, but it more clearly distinguishes the space taken by actual .bss symbols, and the variable space reserved by .brk users. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/vmlinux_32.lds.S | 8 +++++--- arch/x86/kernel/vmlinux_64.lds.S | 4 +++- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 98424f33e07..de14973e47f 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -189,16 +189,18 @@ SECTIONS *(.bss) . = ALIGN(4); __bss_stop = .; + } + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { . = ALIGN(PAGE_SIZE); __brk_base = . ; - . += 64 * 1024 ; /* 64k slop space */ + . += 64 * 1024 ; /* 64k alignment slop space */ *(.brk_reservation) /* areas brk users have reserved */ __brk_limit = . ; - - _end = . ; } + _end = . ; + /* Sections to be discarded */ /DISCARD/ : { *(.exitcall.exit) diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 7996687663a..c8742507b03 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -247,10 +247,12 @@ SECTIONS *(.bss.page_aligned) *(.bss) __bss_stop = .; + } + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { . = ALIGN(PAGE_SIZE); __brk_base = . ; - . += 64 * 1024; /* 64k slop space */ + . += 64 * 1024 ; /* 64k alignment slop space */ *(.brk_reservation) /* areas brk users have reserved */ __brk_limit = . ; } -- cgit v1.2.3 From 0a699af8e613a670be50245366fa18cb19ac5172 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 17 Mar 2009 14:14:31 -0700 Subject: x86-32: move _end to a dummy section Impact: build fix with CONFIG_RELOCATABLE Move _end into a dummy section, so that relocs.c will know it is a relocatable symbol. Signed-off-by: H. Peter Anvin Cc: Jeremy Fitzhardinge Cc: Jeremy Fitzhardinge --- arch/x86/kernel/vmlinux_32.lds.S | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index de14973e47f..62ad500d55f 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -199,7 +199,9 @@ SECTIONS __brk_limit = . ; } - _end = . ; + .end : AT(ADDR(.end) - LOAD_OFFSET) { + _end = . ; + } /* Sections to be discarded */ /DISCARD/ : { -- cgit v1.2.3 From 9d783ba042771284fb4ee5013c3d94220755ae7f Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 16 Mar 2009 17:04:55 -0700 Subject: x86, x2apic: enable fault handling for intr-remapping Impact: interface augmentation (not yet used) Enable fault handling flow for intr-remapping aswell. Fault handling code now shared by both dma-remapping and intr-remapping. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 9 +++++++-- arch/x86/kernel/apic/probe_64.c | 9 +++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 00e6071cefc..b18a7734d68 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3294,7 +3294,12 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms } else #endif { - msg->address_hi = MSI_ADDR_BASE_HI; + if (x2apic_enabled()) + msg->address_hi = MSI_ADDR_BASE_HI | + MSI_ADDR_EXT_DEST_ID(dest); + else + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = MSI_ADDR_BASE_LO | ((apic->irq_dest_mode == 0) ? @@ -3528,7 +3533,7 @@ void arch_teardown_msi_irq(unsigned int irq) destroy_irq(irq); } -#ifdef CONFIG_DMAR +#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) #ifdef CONFIG_SMP static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 8d7748efe6a..8297c2b8ed2 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -68,6 +68,15 @@ void __init default_setup_apic_routing(void) apic = &apic_physflat; printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } + +#ifdef CONFIG_X86_X2APIC + /* + * Now that apic routing model is selected, configure the + * fault handling for intr remapping. + */ + if (intr_remapping_enabled) + enable_drhd_fault_handling(); +#endif } /* Same for both flat and physical. */ -- cgit v1.2.3 From 7c6d9f9785d156d0438401ef270322da3d5247db Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 16 Mar 2009 17:04:59 -0700 Subject: x86, x2apic: use virtual wire A mode in disable_IO_APIC() with interrupt-remapping Impact: make kexec work with x2apic disable_IO_APIC() gets called during crashdump aswell, which configures the IO-APIC/LAPIC so that legacy interrupts can be delivered for the kexec'd kernel. In the presence of interrupt-remapping, we need to change the interrupt-remapping configuration aswell as modifying IO-APIC for virtual wire B mode. To keep things simple during the crash, use virtual wire A mode (for which we don't need to touch io-apic and interrupt-remapping tables). Signed-off-by: Suresh Siddha Cc: Eric W. Biederman Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b18a7734d68..4d975d0e358 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2040,8 +2040,13 @@ void disable_IO_APIC(void) * If the i8259 is routed through an IOAPIC * Put that IOAPIC in virtual wire mode * so legacy interrupts can be delivered. + * + * With interrupt-remapping, for now we will use virtual wire A mode, + * as virtual wire B is little complex (need to configure both + * IOAPIC RTE aswell as interrupt-remapping table entry). + * As this gets called during crash dump, keep this simple for now. */ - if (ioapic_i8259.pin != -1) { + if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { struct IO_APIC_route_entry entry; memset(&entry, 0, sizeof(entry)); @@ -2061,7 +2066,10 @@ void disable_IO_APIC(void) ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } - disconnect_bsp_APIC(ioapic_i8259.pin != -1); + /* + * Use virtual wire A mode when interrupt remapping is enabled. + */ + disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1); } #ifdef CONFIG_X86_32 -- cgit v1.2.3 From cf6567fe40c55e9cffca7355cd34e50fb2871e4e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 16 Mar 2009 17:05:00 -0700 Subject: x86, x2apic: fix clear_local_APIC() in the presence of x2apic Impact: cleanup, paranoia We were not clearing the local APIC in clear_local_APIC() in the presence of x2apic. Fix it. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/apic.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 30909a258d0..699f8cf76bb 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -809,7 +809,7 @@ void clear_local_APIC(void) u32 v; /* APIC hasn't been mapped yet */ - if (!apic_phys) + if (!x2apic && !apic_phys) return; maxlvt = lapic_get_maxlvt(); @@ -1523,12 +1523,10 @@ void __init early_init_lapic_mapping(void) */ void __init init_apic_mappings(void) { -#ifdef CONFIG_X86_X2APIC if (x2apic) { boot_cpu_physical_apicid = read_apic_id(); return; } -#endif /* * If no local APIC can be found then set up a fake all @@ -1972,12 +1970,9 @@ static int lapic_resume(struct sys_device *dev) local_irq_save(flags); -#ifdef CONFIG_X86_X2APIC if (x2apic) enable_x2apic(); - else -#endif - { + else { /* * Make sure the APICBASE points to the right address * -- cgit v1.2.3 From 0280f7c416c652a2fd95d166f52b199ae61122c0 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 16 Mar 2009 17:05:01 -0700 Subject: x86, x2apic: cleanup the IO-APIC level migration with interrupt-remapping Impact: simplification In the current code, for level triggered migration, we need to modify the io-apic RTE with the update vector information, along with modifying interrupt remapping table entry(IRTE) with vector and destination. This is to ensure that remote IRR bit inthe IOAPIC RTE gets cleared when the cpu does EOI. With this patch, for level triggered, we eliminate the io-apic RTE modification (with the updated vector information), by using a virtual vector (io-apic pin number). Real vector that is used for interrupting cpu will be coming from the interrupt-remapping table entry. Trigger mode in the IRTE will always be edge, and the actual level or edge trigger will be setup in the IO-APIC RTE. So a level triggered interrupt will appear as an edge to the local apic cpu but still as level to the IO-APIC. With this change, level irq migration can be done by simply modifying the interrupt-remapping table entry with out changing the io-apic RTE. And as the interrupt appears as edge at the cpu, in addition to do the local apic EOI, we need to do IO-APIC directed EOI to clear the remote IRR bit in the IO-APIC RTE. This simplies the irq migration in the presence of interrupt-remapping. Idea-by: Rajesh Sankaran Signed-off-by: Suresh Siddha Cc: Eric W. Biederman Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 156 +++++++++++++++++------------------------ 1 file changed, 65 insertions(+), 91 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4d975d0e358..e074eac5bd3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -389,6 +389,8 @@ struct io_apic { unsigned int index; unsigned int unused[3]; unsigned int data; + unsigned int unused2[11]; + unsigned int eoi; }; static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) @@ -397,6 +399,12 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) + (mp_ioapics[idx].apicaddr & ~PAGE_MASK); } +static inline void io_apic_eoi(unsigned int apic, unsigned int vector) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(vector, &io_apic->eoi); +} + static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) { struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -1478,7 +1486,7 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t int setup_ioapic_entry(int apic_id, int irq, struct IO_APIC_route_entry *entry, unsigned int destination, int trigger, - int polarity, int vector) + int polarity, int vector, int pin) { /* * add it to the IO-APIC irq-routing table: @@ -1504,7 +1512,14 @@ int setup_ioapic_entry(int apic_id, int irq, irte.present = 1; irte.dst_mode = apic->irq_dest_mode; - irte.trigger_mode = trigger; + /* + * Trigger mode in the IRTE will always be edge, and the + * actual level or edge trigger will be setup in the IO-APIC + * RTE. This will help simplify level triggered irq migration. + * For more details, see the comments above explainig IO-APIC + * irq migration in the presence of interrupt-remapping. + */ + irte.trigger_mode = 0; irte.dlvry_mode = apic->irq_delivery_mode; irte.vector = vector; irte.dest_id = IRTE_DEST(destination); @@ -1515,18 +1530,23 @@ int setup_ioapic_entry(int apic_id, int irq, ir_entry->zero = 0; ir_entry->format = 1; ir_entry->index = (index & 0x7fff); + /* + * IO-APIC RTE will be configured with virtual vector. + * irq handler will do the explicit EOI to the io-apic. + */ + ir_entry->vector = pin; } else #endif { entry->delivery_mode = apic->irq_delivery_mode; entry->dest_mode = apic->irq_dest_mode; entry->dest = destination; + entry->vector = vector; } entry->mask = 0; /* enable IRQ */ entry->trigger = trigger; entry->polarity = polarity; - entry->vector = vector; /* Mask level triggered irqs. * Use IRQ_DELAYED_DISABLE for edge triggered irqs. @@ -1561,7 +1581,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry, - dest, trigger, polarity, cfg->vector)) { + dest, trigger, polarity, cfg->vector, pin)) { printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", mp_ioapics[apic_id].apicid, pin); __clear_irq_vector(irq, cfg); @@ -2311,37 +2331,24 @@ static int ioapic_retrigger_irq(unsigned int irq) #ifdef CONFIG_SMP #ifdef CONFIG_INTR_REMAP -static void ir_irq_migration(struct work_struct *work); - -static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); /* * Migrate the IO-APIC irq in the presence of intr-remapping. * - * For edge triggered, irq migration is a simple atomic update(of vector - * and cpu destination) of IRTE and flush the hardware cache. - * - * For level triggered, we need to modify the io-apic RTE aswell with the update - * vector information, along with modifying IRTE with vector and destination. - * So irq migration for level triggered is little bit more complex compared to - * edge triggered migration. But the good news is, we use the same algorithm - * for level triggered migration as we have today, only difference being, - * we now initiate the irq migration from process context instead of the - * interrupt context. + * For both level and edge triggered, irq migration is a simple atomic + * update(of vector and cpu destination) of IRTE and flush the hardware cache. * - * In future, when we do a directed EOI (combined with cpu EOI broadcast - * suppression) to the IO-APIC, level triggered irq migration will also be - * as simple as edge triggered migration and we can do the irq migration - * with a simple atomic update to IO-APIC RTE. + * For level triggered, we eliminate the io-apic RTE modification (with the + * updated vector information), by using a virtual vector (io-apic pin number). + * Real vector that is used for interrupting cpu will be coming from + * the interrupt-remapping table entry. */ static void migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; struct irte irte; - int modify_ioapic_rte; unsigned int dest; - unsigned long flags; unsigned int irq; if (!cpumask_intersects(mask, cpu_online_mask)) @@ -2359,13 +2366,6 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); - modify_ioapic_rte = desc->status & IRQ_LEVEL; - if (modify_ioapic_rte) { - spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); - } - irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@ -2380,73 +2380,12 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) cpumask_copy(desc->affinity, mask); } -static int migrate_irq_remapped_level_desc(struct irq_desc *desc) -{ - int ret = -1; - struct irq_cfg *cfg = desc->chip_data; - - mask_IO_APIC_irq_desc(desc); - - if (io_apic_level_ack_pending(cfg)) { - /* - * Interrupt in progress. Migrating irq now will change the - * vector information in the IO-APIC RTE and that will confuse - * the EOI broadcast performed by cpu. - * So, delay the irq migration to the next instance. - */ - schedule_delayed_work(&ir_migration_work, 1); - goto unmask; - } - - /* everthing is clear. we have right of way */ - migrate_ioapic_irq_desc(desc, desc->pending_mask); - - ret = 0; - desc->status &= ~IRQ_MOVE_PENDING; - cpumask_clear(desc->pending_mask); - -unmask: - unmask_IO_APIC_irq_desc(desc); - - return ret; -} - -static void ir_irq_migration(struct work_struct *work) -{ - unsigned int irq; - struct irq_desc *desc; - - for_each_irq_desc(irq, desc) { - if (desc->status & IRQ_MOVE_PENDING) { - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); - if (!desc->chip->set_affinity || - !(desc->status & IRQ_MOVE_PENDING)) { - desc->status &= ~IRQ_MOVE_PENDING; - spin_unlock_irqrestore(&desc->lock, flags); - continue; - } - - desc->chip->set_affinity(irq, desc->pending_mask); - spin_unlock_irqrestore(&desc->lock, flags); - } - } -} - /* * Migrates the IRQ destination in the process context. */ static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { - if (desc->status & IRQ_LEVEL) { - desc->status |= IRQ_MOVE_PENDING; - cpumask_copy(desc->pending_mask, mask); - migrate_irq_remapped_level_desc(desc); - return; - } - migrate_ioapic_irq_desc(desc, mask); } static void set_ir_ioapic_affinity_irq(unsigned int irq, @@ -2537,9 +2476,44 @@ static inline void irq_complete_move(struct irq_desc **descp) {} #endif #ifdef CONFIG_INTR_REMAP +static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + + entry = cfg->irq_2_pin; + for (;;) { + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; + io_apic_eoi(apic, pin); + entry = entry->next; + } +} + +static void +eoi_ioapic_irq(struct irq_desc *desc) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_irq(irq, cfg); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void ack_x2apic_level(unsigned int irq) { + struct irq_desc *desc = irq_to_desc(irq); ack_x2APIC_irq(); + eoi_ioapic_irq(desc); } static void ack_x2apic_edge(unsigned int irq) -- cgit v1.2.3 From 29b61be65a33c95564fa82e7e8d60d97adb68ea8 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 16 Mar 2009 17:05:02 -0700 Subject: x86, x2apic: cleanup ifdef CONFIG_INTR_REMAP in io_apic code Impact: cleanup Clean up #ifdefs and replace them with helper functions. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 44 ++++++++++------------------------------- arch/x86/kernel/apic/probe_64.c | 2 -- 2 files changed, 10 insertions(+), 36 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e074eac5bd3..cf27795c641 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -554,16 +554,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq apic = entry->apic; pin = entry->pin; -#ifdef CONFIG_INTR_REMAP /* * With interrupt-remapping, destination information comes * from interrupt-remapping table entry. */ if (!irq_remapped(irq)) io_apic_write(apic, 0x11 + pin*2, dest); -#else - io_apic_write(apic, 0x11 + pin*2, dest); -#endif reg = io_apic_read(apic, 0x10 + pin*2); reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; @@ -1419,9 +1415,8 @@ void __setup_vector_irq(int cpu) } static struct irq_chip ioapic_chip; -#ifdef CONFIG_INTR_REMAP static struct irq_chip ir_ioapic_chip; -#endif +static struct irq_chip msi_ir_chip; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 @@ -1460,7 +1455,6 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t else desc->status &= ~IRQ_LEVEL; -#ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { desc->status |= IRQ_MOVE_PCNTXT; if (trigger) @@ -1472,7 +1466,7 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t handle_edge_irq, "edge"); return; } -#endif + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, @@ -1493,7 +1487,6 @@ int setup_ioapic_entry(int apic_id, int irq, */ memset(entry,0,sizeof(*entry)); -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) { struct intel_iommu *iommu = map_ioapic_to_ir(apic_id); struct irte irte; @@ -1535,9 +1528,7 @@ int setup_ioapic_entry(int apic_id, int irq, * irq handler will do the explicit EOI to the io-apic. */ ir_entry->vector = pin; - } else -#endif - { + } else { entry->delivery_mode = apic->irq_delivery_mode; entry->dest_mode = apic->irq_dest_mode; entry->dest = destination; @@ -1662,10 +1653,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin, { struct IO_APIC_route_entry entry; -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) return; -#endif memset(&entry, 0, sizeof(entry)); @@ -2395,6 +2384,11 @@ static void set_ir_ioapic_affinity_irq(unsigned int irq, set_ir_ioapic_affinity_irq_desc(desc, mask); } +#else +static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, + const struct cpumask *mask) +{ +} #endif asmlinkage void smp_irq_move_cleanup_interrupt(void) @@ -2883,10 +2877,8 @@ static inline void __init check_timer(void) * 8259A. */ if (pin1 == -1) { -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) panic("BIOS bug: timer not connected to IO-APIC"); -#endif pin1 = pin2; apic1 = apic2; no_pin1 = 1; @@ -2922,10 +2914,8 @@ static inline void __init check_timer(void) clear_IO_APIC_pin(0, pin1); goto out; } -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) panic("timer doesn't work through Interrupt-remapped IO-APIC"); -#endif local_irq_disable(); clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) @@ -3219,9 +3209,7 @@ void destroy_irq(unsigned int irq) if (desc) desc->chip_data = cfg; -#ifdef CONFIG_INTR_REMAP free_irte(irq); -#endif spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); spin_unlock_irqrestore(&vector_lock, flags); @@ -3247,7 +3235,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); -#ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { struct irte irte; int ir_index; @@ -3273,9 +3260,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms MSI_ADDR_IR_SHV | MSI_ADDR_IR_INDEX1(ir_index) | MSI_ADDR_IR_INDEX2(ir_index); - } else -#endif - { + } else { if (x2apic_enabled()) msg->address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); @@ -3392,6 +3377,7 @@ static struct irq_chip msi_ir_chip = { #endif .retrigger = ioapic_retrigger_irq, }; +#endif /* * Map the PCI dev to the corresponding remapping hardware unit @@ -3419,7 +3405,6 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) } return index; } -#endif static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { @@ -3433,7 +3418,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) set_irq_msi(irq, msidesc); write_msi_msg(irq, &msg); -#ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { struct irq_desc *desc = irq_to_desc(irq); /* @@ -3442,7 +3426,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) desc->status |= IRQ_MOVE_PCNTXT; set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); } else -#endif set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); @@ -3456,11 +3439,8 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int ret, sub_handle; struct msi_desc *msidesc; unsigned int irq_want; - -#ifdef CONFIG_INTR_REMAP struct intel_iommu *iommu = 0; int index = 0; -#endif irq_want = nr_irqs_gsi; sub_handle = 0; @@ -3469,7 +3449,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (irq == 0) return -1; irq_want = irq + 1; -#ifdef CONFIG_INTR_REMAP if (!intr_remapping_enabled) goto no_ir; @@ -3497,7 +3476,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) set_irte_irq(irq, iommu, index, sub_handle); } no_ir: -#endif ret = setup_msi_irq(dev, msidesc, irq); if (ret < 0) goto error; @@ -4032,11 +4010,9 @@ void __init setup_ioapic_dest(void) else mask = apic->target_cpus(); -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) set_ir_ioapic_affinity_irq_desc(desc, mask); else -#endif set_ioapic_affinity_irq_desc(desc, mask); } diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 8297c2b8ed2..1783652bb0e 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -69,14 +69,12 @@ void __init default_setup_apic_routing(void) printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } -#ifdef CONFIG_X86_X2APIC /* * Now that apic routing model is selected, configure the * fault handling for intr remapping. */ if (intr_remapping_enabled) enable_drhd_fault_handling(); -#endif } /* Same for both flat and physical. */ -- cgit v1.2.3 From 05c3dc2c4b60387769cbe73174347de4cf85f0c9 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 16 Mar 2009 17:05:03 -0700 Subject: x86, ioapic: Fix non atomic allocation with interrupts disabled Impact: fix possible race save_mask_IO_APIC_setup() was using non atomic memory allocation while getting called with interrupts disabled. Fix this by splitting this into two different function. Allocation part save_IO_APIC_setup() now happens before disabling interrupts. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/apic.c | 11 ++++++----- arch/x86/kernel/apic/io_apic.c | 34 +++++++++++++++++++++++----------- 2 files changed, 29 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 699f8cf76bb..85eb8e10081 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1334,15 +1334,16 @@ void __init enable_IR_x2apic(void) return; } - local_irq_save(flags); - mask_8259A(); - - ret = save_mask_IO_APIC_setup(); + ret = save_IO_APIC_setup(); if (ret) { pr_info("Saving IO-APIC state failed: %d\n", ret); goto end; } + local_irq_save(flags); + mask_IO_APIC_setup(); + mask_8259A(); + ret = enable_intr_remapping(1); if (ret && x2apic_preenabled) { @@ -1367,10 +1368,10 @@ end_restore: else reinit_intr_remapped_IO_APIC(x2apic_preenabled); -end: unmask_8259A(); local_irq_restore(flags); +end: if (!ret) { if (!x2apic_preenabled) pr_info("Enabled x2apic and interrupt-remapping\n"); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index cf27795c641..ff1759a1128 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -853,9 +853,9 @@ __setup("pirq=", ioapic_pirq_setup); static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; /* - * Saves and masks all the unmasked IO-APIC RTE's + * Saves all the IO-APIC RTE's */ -int save_mask_IO_APIC_setup(void) +int save_IO_APIC_setup(void) { union IO_APIC_reg_01 reg_01; unsigned long flags; @@ -880,16 +880,9 @@ int save_mask_IO_APIC_setup(void) } for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - struct IO_APIC_route_entry entry; - - entry = early_ioapic_entries[apic][pin] = + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + early_ioapic_entries[apic][pin] = ioapic_read_entry(apic, pin); - if (!entry.mask) { - entry.mask = 1; - ioapic_write_entry(apic, pin, entry); - } - } return 0; @@ -902,6 +895,25 @@ nomem: return -ENOMEM; } +void mask_IO_APIC_setup(void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + if (!early_ioapic_entries[apic]) + break; + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + struct IO_APIC_route_entry entry; + + entry = early_ioapic_entries[apic][pin]; + if (!entry.mask) { + entry.mask = 1; + ioapic_write_entry(apic, pin, entry); + } + } + } +} + void restore_IO_APIC_setup(void) { int apic, pin; -- cgit v1.2.3 From 68a8ca593fac82e336a792226272455901fa83df Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 16 Mar 2009 17:05:04 -0700 Subject: x86: fix broken irq migration logic while cleaning up multiple vectors Impact: fix spurious IRQs During irq migration, we send a low priority interrupt to the previous irq destination. This happens in non interrupt-remapping case after interrupt starts arriving at new destination and in interrupt-remapping case after modifying and flushing the interrupt-remapping table entry caches. This low priority irq cleanup handler can cleanup multiple vectors, as multiple irq's can be migrated at almost the same time. While there will be multiple invocations of irq cleanup handler (one cleanup IPI for each irq migration), first invocation of the cleanup handler can potentially cleanup more than one vector (as the first invocation can see the requests for more than vector cleanup). When we cleanup multiple vectors during the first invocation of the smp_irq_move_cleanup_interrupt(), other vectors that are to be cleanedup can still be pending in the local cpu's IRR (as smp_irq_move_cleanup_interrupt() runs with interrupts disabled). When we are ready to unhook a vector corresponding to an irq, check if that vector is registered in the local cpu's IRR. If so skip that cleanup and do a self IPI with the cleanup vector, so that we give a chance to service the pending vector interrupt and then cleanup that vector allocation once we execute the lowest priority handler. This fixes spurious interrupts seen when migrating multiple vectors at the same time. [ This is apparently possible even on conventional xapic, although to the best of our knowledge it has never been seen. The stable maintainers may wish to consider this one for -stable. ] Signed-off-by: Suresh Siddha Cc: Eric W. Biederman Signed-off-by: H. Peter Anvin Cc: stable@kernel.org --- arch/x86/kernel/apic/io_apic.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ff1759a1128..42cdc78427a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2414,6 +2414,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { unsigned int irq; + unsigned int irr; struct irq_desc *desc; struct irq_cfg *cfg; irq = __get_cpu_var(vector_irq)[vector]; @@ -2433,6 +2434,18 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + /* + * Check if the vector that needs to be cleanedup is + * registered at the cpu's IRR. If so, then this is not + * the best time to clean it up. Lets clean it up in the + * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR + * to myself. + */ + if (irr & (1 << (vector % 32))) { + apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); + goto unlock; + } __get_cpu_var(vector_irq)[vector] = -1; cfg->move_cleanup_count--; unlock: -- cgit v1.2.3 From a6b6a14e0c60561f2902b078bd28d0e61defad70 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 18 Mar 2009 10:40:25 +1030 Subject: x86: use smp_call_function_single() in arch/x86/kernel/cpu/mcheck/mce_amd_64.c Attempting to rid us of the problematic work_on_cpu(). Just use smp_call_function_single() here. Signed-off-by: Andrew Morton Signed-off-by: Rusty Russell LKML-Reference: <20090318042217.EF3F1DDF39@ozlabs.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 40 ++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index c5a32f92d07..7d01be86887 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -92,7 +92,8 @@ struct thresh_restart { }; /* must be called with correct cpu affinity */ -static long threshold_restart_bank(void *_tr) +/* Called via smp_call_function_single() */ +static void threshold_restart_bank(void *_tr) { struct thresh_restart *tr = _tr; u32 mci_misc_hi, mci_misc_lo; @@ -119,7 +120,6 @@ static long threshold_restart_bank(void *_tr) mci_misc_hi |= MASK_COUNT_EN_HI; wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); - return 0; } /* cpu init entry point, called from mce.c with preempt off */ @@ -279,7 +279,7 @@ static ssize_t store_interrupt_enable(struct threshold_block *b, tr.b = b; tr.reset = 0; tr.old_limit = 0; - work_on_cpu(b->cpu, threshold_restart_bank, &tr); + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); return end - buf; } @@ -301,23 +301,32 @@ static ssize_t store_threshold_limit(struct threshold_block *b, tr.b = b; tr.reset = 0; - work_on_cpu(b->cpu, threshold_restart_bank, &tr); + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); return end - buf; } -static long local_error_count(void *_b) +struct threshold_block_cross_cpu { + struct threshold_block *tb; + long retval; +}; + +static void local_error_count_handler(void *_tbcc) { - struct threshold_block *b = _b; + struct threshold_block_cross_cpu *tbcc = _tbcc; + struct threshold_block *b = tbcc->tb; u32 low, high; rdmsr(b->address, low, high); - return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); + tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); } static ssize_t show_error_count(struct threshold_block *b, char *buf) { - return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b)); + struct threshold_block_cross_cpu tbcc = { .tb = b, }; + + smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1); + return sprintf(buf, "%lx\n", tbcc.retval); } static ssize_t store_error_count(struct threshold_block *b, @@ -325,7 +334,7 @@ static ssize_t store_error_count(struct threshold_block *b, { struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; - work_on_cpu(b->cpu, threshold_restart_bank, &tr); + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); return 1; } @@ -394,7 +403,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) return 0; - if (rdmsr_safe(address, &low, &high)) + if (rdmsr_safe_on_cpu(cpu, address, &low, &high)) return 0; if (!(high & MASK_VALID_HI)) { @@ -458,12 +467,11 @@ out_free: return err; } -static __cpuinit long local_allocate_threshold_blocks(void *_bank) +static __cpuinit long +local_allocate_threshold_blocks(int cpu, unsigned int bank) { - unsigned int *bank = _bank; - - return allocate_threshold_blocks(smp_processor_id(), *bank, 0, - MSR_IA32_MC0_MISC + *bank * 4); + return allocate_threshold_blocks(cpu, bank, 0, + MSR_IA32_MC0_MISC + bank * 4); } /* symlinks sibling shared banks to first core. first core owns dir/files. */ @@ -526,7 +534,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) per_cpu(threshold_banks, cpu)[bank] = b; - err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank); + err = local_allocate_threshold_blocks(cpu, bank); if (err) goto out_free; -- cgit v1.2.3 From ce4e240c279a31096f74afa6584a62d64a1ba8c8 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 17 Mar 2009 10:16:54 -0800 Subject: x86: add x2apic_wrmsr_fence() to x2apic flush tlb paths Impact: optimize APIC IPI related barriers Uncached MMIO accesses for xapic are inherently serializing and hence we don't need explicit barriers for xapic IPI paths. x2apic MSR writes/reads don't have serializing semantics and hence need a serializing instruction or mfence, to make all the previous memory stores globally visisble before the x2apic msr write for IPI. Add x2apic_wrmsr_fence() in flush tlb path to x2apic specific paths. Signed-off-by: Suresh Siddha Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Jens Axboe Cc: Linus Torvalds Cc: "Paul E. McKenney" Cc: Rusty Russell Cc: Steven Rostedt Cc: "steiner@sgi.com" Cc: Nick Piggin LKML-Reference: <1237313814.27006.203.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_cluster.c | 6 ++++++ arch/x86/kernel/apic/x2apic_phys.c | 6 ++++++ 2 files changed, 12 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 8fb87b6dd63..4a903e2f0d1 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -57,6 +57,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) unsigned long query_cpu; unsigned long flags; + x2apic_wrmsr_fence(); + local_irq_save(flags); for_each_cpu(query_cpu, mask) { __x2apic_send_IPI_dest( @@ -73,6 +75,8 @@ static void unsigned long query_cpu; unsigned long flags; + x2apic_wrmsr_fence(); + local_irq_save(flags); for_each_cpu(query_cpu, mask) { if (query_cpu == this_cpu) @@ -90,6 +94,8 @@ static void x2apic_send_IPI_allbutself(int vector) unsigned long query_cpu; unsigned long flags; + x2apic_wrmsr_fence(); + local_irq_save(flags); for_each_online_cpu(query_cpu) { if (query_cpu == this_cpu) diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 23625b9f98b..a284359627e 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -58,6 +58,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) unsigned long query_cpu; unsigned long flags; + x2apic_wrmsr_fence(); + local_irq_save(flags); for_each_cpu(query_cpu, mask) { __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), @@ -73,6 +75,8 @@ static void unsigned long query_cpu; unsigned long flags; + x2apic_wrmsr_fence(); + local_irq_save(flags); for_each_cpu(query_cpu, mask) { if (query_cpu != this_cpu) @@ -89,6 +93,8 @@ static void x2apic_send_IPI_allbutself(int vector) unsigned long query_cpu; unsigned long flags; + x2apic_wrmsr_fence(); + local_irq_save(flags); for_each_online_cpu(query_cpu) { if (query_cpu == this_cpu) -- cgit v1.2.3 From 2c74d66624ddbda8101d54d1e184cf9229b378bc Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 18 Mar 2009 08:22:30 +1030 Subject: x86, uv: fix cpumask iterator in uv_bau_init() Impact: fix boot crash on UV systems Commit 76ba0ecda0de9accea9a91cb6dbde46782110e1c "cpumask: use cpumask_var_t in uv_flush_tlb_others" used cur_cpu as an iterator; it was supposed to be zero for the code below it. Reported-by: Cliff Wickman Original-From: Cliff Wickman Signed-off-by: Rusty Russell Acked-by: Mike Travis Cc: steiner@sgi.com Cc: LKML-Reference: <200903180822.31196.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index d038b9c45cf..79c07324728 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -750,7 +750,7 @@ static int __init uv_bau_init(void) int node; int nblades; int last_blade; - int cur_cpu = 0; + int cur_cpu; if (!is_uv_system()) return 0; @@ -760,6 +760,7 @@ static int __init uv_bau_init(void) uv_mmask = (1UL << uv_hub_info->n_val) - 1; nblades = 0; last_blade = -1; + cur_cpu = 0; for_each_online_node(node) { blade = uv_node_to_blade_id(node); if (blade == last_blade) -- cgit v1.2.3 From 4e16c888754468a58d4829c2eba2c6aa3a065e2b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 18 Mar 2009 17:36:55 +0530 Subject: x86: cpu/mttr/cleanup.c fix compilation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit arch/x86/kernel/cpu/mtrr/cleanup.c:197: warning: format ‘%d’ expects type ‘int’, but argument 2 has type ‘long unsigned int’ Signed-off-by: Jaswinder Singh Rajput Cc: Yinghai Lu LKML-Reference: <1237378015.13488.1.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cleanup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index f4f89fbc228..ce0fe4b5c04 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -160,8 +160,9 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, unsigned long extra_remove_base, unsigned long extra_remove_size) { - unsigned long i, base, size; + unsigned long base, size; mtrr_type type; + int i; for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; -- cgit v1.2.3 From cde5edbda8ba7d600154ce4171125a48e4d2a21b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 18 Mar 2009 17:37:45 +0530 Subject: x86: kprobes.c fix compilation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit arch/x86/kernel/kprobes.c:196: warning: passing argument 1 of ‘search_exception_tables’ makes integer from pointer without a cast Signed-off-by: Jaswinder Singh Rajput Cc: Linus Torvalds LKML-Reference:<49BED952.2050809@redhat.com> LKML-Reference: <1237378065.13488.2.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 4558dd3918c..55b94614e34 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -193,7 +193,7 @@ static int __kprobes can_boost(kprobe_opcode_t *opcodes) kprobe_opcode_t opcode; kprobe_opcode_t *orig_opcodes = opcodes; - if (search_exception_tables(opcodes)) + if (search_exception_tables((unsigned long)opcodes)) return 0; /* Page fault may occur on this address. */ retry: -- cgit v1.2.3 From a6830278568a8bb9758aac152db15187741e0113 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 18 Mar 2009 20:42:28 +0530 Subject: x86: mpparse: clean up code by introducing a few helper functions Impact: cleanup Refactor the MP-table parsing code via the introduction of the following helper functions: skip_entry() smp_reserve_bootmem() check_irq_src() check_slot() To simplify the code flow and to reduce the size of the following oversized functions: smp_read_mpc(), smp_scan_config(). There should be no impact to functionality. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 261 +++++++++++++++++++++------------------------- 1 file changed, 120 insertions(+), 141 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 47673e02ae5..58ddf6259af 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -109,9 +109,6 @@ static void __init MP_bus_info(struct mpc_bus *m) } else printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); } -#endif - -#ifdef CONFIG_X86_IO_APIC static int bad_ioapic(unsigned long address) { @@ -224,8 +221,12 @@ static void __init MP_intsrc_info(struct mpc_intsrc *m) if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!!\n"); } +#else /* CONFIG_X86_IO_APIC */ +static inline void __init MP_bus_info(struct mpc_bus *m) {} +static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} +static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {} +#endif /* CONFIG_X86_IO_APIC */ -#endif static void __init MP_lintsrc_info(struct mpc_lintsrc *m) { @@ -275,6 +276,12 @@ static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str) return 1; } +static void skip_entry(unsigned char **ptr, int *count, int size) +{ + *ptr += size; + *count += size; +} + static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; @@ -310,55 +317,27 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) while (count < mpc->length) { switch (*mpt) { case MP_PROCESSOR: - { - struct mpc_cpu *m = (struct mpc_cpu *)mpt; - /* ACPI may have already provided this data */ - if (!acpi_lapic) - MP_processor_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } + /* ACPI may have already provided this data */ + if (!acpi_lapic) + MP_processor_info((struct mpc_cpu *)&mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); + break; case MP_BUS: - { - struct mpc_bus *m = (struct mpc_bus *)mpt; -#ifdef CONFIG_X86_IO_APIC - MP_bus_info(m); -#endif - mpt += sizeof(*m); - count += sizeof(*m); - break; - } + MP_bus_info((struct mpc_bus *)&mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_bus)); + break; case MP_IOAPIC: - { -#ifdef CONFIG_X86_IO_APIC - struct mpc_ioapic *m = (struct mpc_ioapic *)mpt; - MP_ioapic_info(m); -#endif - mpt += sizeof(struct mpc_ioapic); - count += sizeof(struct mpc_ioapic); - break; - } + MP_ioapic_info((struct mpc_ioapic *)&mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); + break; case MP_INTSRC: - { -#ifdef CONFIG_X86_IO_APIC - struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; - - MP_intsrc_info(m); -#endif - mpt += sizeof(struct mpc_intsrc); - count += sizeof(struct mpc_intsrc); - break; - } + MP_intsrc_info((struct mpc_intsrc *)&mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); + break; case MP_LINTSRC: - { - struct mpc_lintsrc *m = - (struct mpc_lintsrc *)mpt; - MP_lintsrc_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } + MP_lintsrc_info((struct mpc_lintsrc *)&mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); + break; default: /* wrong mptable */ printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); @@ -689,6 +668,31 @@ void __init get_smp_config(void) __get_smp_config(0); } +static void smp_reserve_bootmem(struct mpf_intel *mpf) +{ + unsigned long size = get_mpc_size(mpf->physptr); +#ifdef CONFIG_X86_32 + /* + * We cannot access to MPC table to compute table size yet, + * as only few megabytes from the bottom is mapped now. + * PC-9800's MPC table places on the very last of physical + * memory; so that simply reserving PAGE_SIZE from mpf->physptr + * yields BUG() in reserve_bootmem. + * also need to make sure physptr is below than max_low_pfn + * we don't need reserve the area above max_low_pfn + */ + unsigned long end = max_low_pfn * PAGE_SIZE; + + if (mpf->physptr < end) { + if (mpf->physptr + size > end) + size = end - mpf->physptr; + reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT); + } +#else + reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT); +#endif +} + static int __init smp_scan_config(unsigned long base, unsigned long length, unsigned reserve) { @@ -717,35 +721,9 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, if (!reserve) return 1; reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf), - BOOTMEM_DEFAULT); - if (mpf->physptr) { - unsigned long size = get_mpc_size(mpf->physptr); -#ifdef CONFIG_X86_32 - /* - * We cannot access to MPC table to compute - * table size yet, as only few megabytes from - * the bottom is mapped now. - * PC-9800's MPC table places on the very last - * of physical memory; so that simply reserving - * PAGE_SIZE from mpf->physptr yields BUG() - * in reserve_bootmem. - * also need to make sure physptr is below than - * max_low_pfn - * we don't need reserve the area above max_low_pfn - */ - unsigned long end = max_low_pfn * PAGE_SIZE; - - if (mpf->physptr < end) { - if (mpf->physptr + size > end) - size = end - mpf->physptr; - reserve_bootmem_generic(mpf->physptr, size, - BOOTMEM_DEFAULT); - } -#else - reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT); -#endif - } + if (mpf->physptr) + smp_reserve_bootmem(mpf); return 1; } @@ -848,7 +826,57 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m) #define SPARE_SLOT_NUM 20 static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; -#endif + +static void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) +{ + int i; + + apic_printk(APIC_VERBOSE, "OLD "); + print_MP_intsrc_info(m); + + i = get_MP_intsrc_index(m); + if (i > 0) { + assign_to_mpc_intsrc(&mp_irqs[i], m); + apic_printk(APIC_VERBOSE, "NEW "); + print_mp_irq_info(&mp_irqs[i]); + return; + } + if (!i) { + /* legacy, do nothing */ + return; + } + if (*nr_m_spare < SPARE_SLOT_NUM) { + /* + * not found (-1), or duplicated (-2) are invalid entries, + * we need to use the slot later + */ + m_spare[*nr_m_spare] = m; + *nr_m_spare += 1; + } +} +#else /* CONFIG_X86_IO_APIC */ +static inline void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} +#endif /* CONFIG_X86_IO_APIC */ + +static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, + int count) +{ + if (!mpc_new_phys) { + pr_info("No spare slots, try to append...take your risk, " + "new mpc_length %x\n", count); + } else { + if (count <= mpc_new_length) + pr_info("No spare slots, try to append..., " + "new mpc_length %x\n", count); + else { + pr_err("mpc_new_length %lx is too small\n", + mpc_new_length); + return -1; + } + } + + return 0; +} static int __init replace_intsrc_all(struct mpc_table *mpc, unsigned long mpc_new_phys, @@ -856,71 +884,30 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, { #ifdef CONFIG_X86_IO_APIC int i; - int nr_m_spare = 0; #endif - int count = sizeof(*mpc); + int nr_m_spare = 0; unsigned char *mpt = ((unsigned char *)mpc) + count; printk(KERN_INFO "mpc_length %x\n", mpc->length); while (count < mpc->length) { switch (*mpt) { case MP_PROCESSOR: - { - struct mpc_cpu *m = (struct mpc_cpu *)mpt; - mpt += sizeof(*m); - count += sizeof(*m); - break; - } + skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); + break; case MP_BUS: - { - struct mpc_bus *m = (struct mpc_bus *)mpt; - mpt += sizeof(*m); - count += sizeof(*m); - break; - } + skip_entry(&mpt, &count, sizeof(struct mpc_bus)); + break; case MP_IOAPIC: - { - mpt += sizeof(struct mpc_ioapic); - count += sizeof(struct mpc_ioapic); - break; - } + skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); + break; case MP_INTSRC: - { -#ifdef CONFIG_X86_IO_APIC - struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; - - apic_printk(APIC_VERBOSE, "OLD "); - print_MP_intsrc_info(m); - i = get_MP_intsrc_index(m); - if (i > 0) { - assign_to_mpc_intsrc(&mp_irqs[i], m); - apic_printk(APIC_VERBOSE, "NEW "); - print_mp_irq_info(&mp_irqs[i]); - } else if (!i) { - /* legacy, do nothing */ - } else if (nr_m_spare < SPARE_SLOT_NUM) { - /* - * not found (-1), or duplicated (-2) - * are invalid entries, - * we need to use the slot later - */ - m_spare[nr_m_spare] = m; - nr_m_spare++; - } -#endif - mpt += sizeof(struct mpc_intsrc); - count += sizeof(struct mpc_intsrc); - break; - } + check_irq_src((struct mpc_intsrc *)&mpt, &nr_m_spare); + skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); + break; case MP_LINTSRC: - { - struct mpc_lintsrc *m = - (struct mpc_lintsrc *)mpt; - mpt += sizeof(*m); - count += sizeof(*m); - break; - } + skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); + break; default: /* wrong mptable */ printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); @@ -950,16 +937,8 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, } else { struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; count += sizeof(struct mpc_intsrc); - if (!mpc_new_phys) { - printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count); - } else { - if (count <= mpc_new_length) - printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count); - else { - printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length); - goto out; - } - } + if (!check_slot(mpc_new_phys, mpc_new_length, count)) + goto out; assign_to_mpc_intsrc(&mp_irqs[i], m); mpc->length = count; mpt += sizeof(struct mpc_intsrc); -- cgit v1.2.3 From c58603e81b3ed4f1c7352e091fe43fd0bd8d06cc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 19 Mar 2009 08:50:35 +0100 Subject: x86: mpparse: clean up code by introducing a few helper functions, fix Impact: fix boot crash This fixes commit a6830278568a8bb9758aac152db15187741e0113. Signed-off-by: Jaswinder Singh Rajput Cc: Yinghai Lu LKML-Reference: <1237403503.22438.21.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 58ddf6259af..290cb57f469 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -319,23 +319,23 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) case MP_PROCESSOR: /* ACPI may have already provided this data */ if (!acpi_lapic) - MP_processor_info((struct mpc_cpu *)&mpt); + MP_processor_info((struct mpc_cpu *)mpt); skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); break; case MP_BUS: - MP_bus_info((struct mpc_bus *)&mpt); + MP_bus_info((struct mpc_bus *)mpt); skip_entry(&mpt, &count, sizeof(struct mpc_bus)); break; case MP_IOAPIC: - MP_ioapic_info((struct mpc_ioapic *)&mpt); + MP_ioapic_info((struct mpc_ioapic *)mpt); skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); break; case MP_INTSRC: - MP_intsrc_info((struct mpc_intsrc *)&mpt); + MP_intsrc_info((struct mpc_intsrc *)mpt); skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); break; case MP_LINTSRC: - MP_lintsrc_info((struct mpc_lintsrc *)&mpt); + MP_lintsrc_info((struct mpc_lintsrc *)mpt); skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); break; default: @@ -902,7 +902,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); break; case MP_INTSRC: - check_irq_src((struct mpc_intsrc *)&mpt, &nr_m_spare); + check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare); skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); break; case MP_LINTSRC: -- cgit v1.2.3 From 71ff49d71bb5cfcd2689b54cb433c0e6990a1d86 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 18 Mar 2009 13:03:33 -0700 Subject: x86: with the last user gone, remove set_pte_present Impact: cleanup set_pte_present() is no longer used, directly or indirectly, so remove it. Signed-off-by: Jeremy Fitzhardinge Cc: Xen-devel Cc: Jeremy Fitzhardinge Cc: Alok Kataria Cc: Marcelo Tosatti Cc: Avi Kivity LKML-Reference: <1237406613-2929-2-git-send-email-jeremy@goop.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kvm.c | 7 ------- arch/x86/kernel/paravirt.c | 1 - arch/x86/kernel/vmi_32.c | 6 ------ 3 files changed, 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 478bca986ec..33019ddb56b 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -138,12 +138,6 @@ static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte) kvm_mmu_write(ptep, pte_val(pte)); } -static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - kvm_mmu_write(ptep, pte_val(pte)); -} - static void kvm_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -220,7 +214,6 @@ static void paravirt_ops_setup(void) #if PAGETABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic; - pv_mmu_ops.set_pte_present = kvm_set_pte_present; pv_mmu_ops.pte_clear = kvm_pte_clear; pv_mmu_ops.pmd_clear = kvm_pmd_clear; #endif diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 63dd358d8ee..8e45f446488 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -470,7 +470,6 @@ struct pv_mmu_ops pv_mmu_ops = { #if PAGETABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE .set_pte_atomic = native_set_pte_atomic, - .set_pte_present = native_set_pte_present, .pte_clear = native_pte_clear, .pmd_clear = native_pmd_clear, #endif diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 2cc4a90e2cb..95deb9f2211 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -395,11 +395,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval) vmi_ops.update_pte(ptep, VMI_PAGE_PT); } -static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) -{ - vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1)); -} - static void vmi_set_pud(pud_t *pudp, pud_t pudval) { /* Um, eww */ @@ -750,7 +745,6 @@ static inline int __init activate_vmi(void) pv_mmu_ops.set_pmd = vmi_set_pmd; #ifdef CONFIG_X86_PAE pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic; - pv_mmu_ops.set_pte_present = vmi_set_pte_present; pv_mmu_ops.set_pud = vmi_set_pud; pv_mmu_ops.pte_clear = vmi_pte_clear; pv_mmu_ops.pmd_clear = vmi_pmd_clear; -- cgit v1.2.3 From 14fc9fbc700dc95b4f46ebd588169324fe6deff8 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 19 Mar 2009 10:56:29 -0700 Subject: x86: signal: check signal stack overflow properly Impact: cleanup Check alternate signal stack overflow with proper stack pointer. The stack pointer of the next signal frame is different if that task has i387 state. On x86_64, redzone would be included. No need to check SA_ONSTACK if we're already using alternate signal stack. Signed-off-by: Hiroshi Shimamoto Cc: Roland McGrath LKML-Reference: <49C2874D.3080002@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal.c | 48 +++++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index d2cc6428c58..dfcc74ab0ab 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -211,31 +211,27 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, { /* Default to using normal stack */ unsigned long sp = regs->sp; + int onsigstack = on_sig_stack(sp); #ifdef CONFIG_X86_64 /* redzone */ sp -= 128; #endif /* CONFIG_X86_64 */ - /* - * If we are on the alternate signal stack and would overflow it, don't. - * Return an always-bogus address instead so we will die with SIGSEGV. - */ - if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size))) - return (void __user *) -1L; - - /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(sp) == 0) - sp = current->sas_ss_sp + current->sas_ss_size; - } else { + if (!onsigstack) { + /* This is the X/Open sanctioned signal stack switching. */ + if (ka->sa.sa_flags & SA_ONSTACK) { + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; + } else { #ifdef CONFIG_X86_32 - /* This is the legacy signal stack switching. */ - if ((regs->ss & 0xffff) != __USER_DS && - !(ka->sa.sa_flags & SA_RESTORER) && - ka->sa.sa_restorer) - sp = (unsigned long) ka->sa.sa_restorer; + /* This is the legacy signal stack switching. */ + if ((regs->ss & 0xffff) != __USER_DS && + !(ka->sa.sa_flags & SA_RESTORER) && + ka->sa.sa_restorer) + sp = (unsigned long) ka->sa.sa_restorer; #endif /* CONFIG_X86_32 */ + } } if (used_math()) { @@ -244,12 +240,22 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, sp = round_down(sp, 64); #endif /* CONFIG_X86_64 */ *fpstate = (void __user *)sp; - - if (save_i387_xstate(*fpstate) < 0) - return (void __user *)-1L; } - return (void __user *)align_sigframe(sp - frame_size); + sp = align_sigframe(sp - frame_size); + + /* + * If we are on the alternate signal stack and would overflow it, don't. + * Return an always-bogus address instead so we will die with SIGSEGV. + */ + if (onsigstack && !likely(on_sig_stack(sp))) + return (void __user *)-1L; + + /* save i387 state */ + if (used_math() && save_i387_xstate(*fpstate) < 0) + return (void __user *)-1L; + + return (void __user *)sp; } #ifdef CONFIG_X86_32 -- cgit v1.2.3 From 04c93ce4991fce731dab346d03964504339347db Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 20 Mar 2009 21:02:55 +0100 Subject: x86: fix IO APIC resource allocation error message Impact: fix incorrect error message - IO APIC resource allocation error message contains one too many "be". - Print the error message iff there are IO APICs in the system. I've seen this error message for some time on my x86-32 laptop... Signed-off-by: Bartlomiej Zolnierkiewicz Cc: Alan Bartlett LKML-Reference: <200903202100.30789.bzolnier@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 42cdc78427a..d882c03604e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -4130,9 +4130,12 @@ static int __init ioapic_insert_resources(void) struct resource *r = ioapic_resources; if (!r) { - printk(KERN_ERR - "IO APIC resources could be not be allocated.\n"); - return -1; + if (nr_ioapics > 0) { + printk(KERN_ERR + "IO APIC resources couldn't be allocated.\n"); + return -1; + } + return 0; } for (i = 0; i < nr_ioapics; i++) { -- cgit v1.2.3 From 5a5737eac224f01e264477954d92ed6e69047b7a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 21 Mar 2009 13:28:39 +0530 Subject: x86: mpparse.c introduce smp_dump_mptable helper function smp_read_mpc() and replace_intsrc_all() can use same smp_dump_mptable() Signed-off-by: Jaswinder Singh Rajput --- arch/x86/kernel/mpparse.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 290cb57f469..4216d265366 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -282,6 +282,14 @@ static void skip_entry(unsigned char **ptr, int *count, int size) *count += size; } +static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) +{ + printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n" + "type %x\n", *mpt); + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, + 1, mpc, mpc->length, 1); +} + static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; @@ -340,10 +348,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) break; default: /* wrong mptable */ - printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); - printk(KERN_ERR "type %x\n", *mpt); - print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, - 1, mpc, mpc->length, 1); + smp_dump_mptable(mpc, mpt); count = mpc->length; break; } @@ -910,10 +915,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, break; default: /* wrong mptable */ - printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); - printk(KERN_ERR "type %x\n", *mpt); - print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, - 1, mpc, mpc->length, 1); + smp_dump_mptable(mpc, mpt); goto out; } } -- cgit v1.2.3 From 0b3ba0c3ccc7ced2a06fed405e80c8e1c77a3ee7 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 21 Mar 2009 13:43:20 +0530 Subject: x86: mpparse.c introduce check_physptr helper function To reduce the size of the oversized function __get_smp_config() There should be no impact to functionality. Signed-off-by: Jaswinder Singh Rajput --- arch/x86/kernel/mpparse.c | 94 +++++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 44 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 4216d265366..dce99dca6cf 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -555,6 +555,55 @@ static unsigned long __init get_mpc_size(unsigned long physptr) return size; } +static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) +{ + struct mpc_table *mpc; + unsigned long size; + + size = get_mpc_size(mpf->physptr); + mpc = early_ioremap(mpf->physptr, size); + /* + * Read the physical hardware table. Anything here will + * override the defaults. + */ + if (!smp_read_mpc(mpc, early)) { +#ifdef CONFIG_X86_LOCAL_APIC + smp_found_config = 0; +#endif + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n" + "... disabling SMP support. (tell your hw vendor)\n"); + early_iounmap(mpc, size); + return -1; + } + early_iounmap(mpc, size); + + if (early) + return -1; + +#ifdef CONFIG_X86_IO_APIC + /* + * If there are no explicit MP IRQ entries, then we are + * broken. We set up most of the low 16 IO-APIC pins to + * ISA defaults and hope it will work. + */ + if (!mp_irq_entries) { + struct mpc_bus bus; + + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, " + "using default mptable. (tell your hw vendor)\n"); + + bus.type = MP_BUS; + bus.busid = 0; + memcpy(bus.bustype, "ISA ", 6); + MP_bus_info(&bus); + + construct_default_ioirq_mptable(0); + } +#endif + + return 0; +} + /* * Scan the memory blocks for an SMP configuration block. */ @@ -608,51 +657,8 @@ static void __init __get_smp_config(unsigned int early) construct_default_ISA_mptable(mpf->feature1); } else if (mpf->physptr) { - struct mpc_table *mpc; - unsigned long size; - - size = get_mpc_size(mpf->physptr); - mpc = early_ioremap(mpf->physptr, size); - /* - * Read the physical hardware table. Anything here will - * override the defaults. - */ - if (!smp_read_mpc(mpc, early)) { -#ifdef CONFIG_X86_LOCAL_APIC - smp_found_config = 0; -#endif - printk(KERN_ERR - "BIOS bug, MP table errors detected!...\n"); - printk(KERN_ERR "... disabling SMP support. " - "(tell your hw vendor)\n"); - early_iounmap(mpc, size); - return; - } - early_iounmap(mpc, size); - - if (early) + if (check_physptr(mpf, early)) return; -#ifdef CONFIG_X86_IO_APIC - /* - * If there are no explicit MP IRQ entries, then we are - * broken. We set up most of the low 16 IO-APIC pins to - * ISA defaults and hope it will work. - */ - if (!mp_irq_entries) { - struct mpc_bus bus; - - printk(KERN_ERR "BIOS bug, no explicit IRQ entries, " - "using default mptable. " - "(tell your hw vendor)\n"); - - bus.type = MP_BUS; - bus.busid = 0; - memcpy(bus.bustype, "ISA ", 6); - MP_bus_info(&bus); - - construct_default_ioirq_mptable(0); - } -#endif } else BUG(); -- cgit v1.2.3 From 271eb5c588e5d0a41eca6e118b6927af3f0f04b9 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 21 Mar 2009 16:55:24 +0530 Subject: x86: topology.c cleanup Impact: cleanup Signed-off-by: Jaswinder Singh Rajput --- arch/x86/kernel/topology.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 0fcc95a354f..7e4515957a1 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -25,10 +25,10 @@ * * Send feedback to */ -#include -#include #include #include +#include +#include #include static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); @@ -47,6 +47,7 @@ int __ref arch_register_cpu(int num) */ if (num) per_cpu(cpu_devices, num).cpu.hotpluggable = 1; + return register_cpu(&per_cpu(cpu_devices, num).cpu, num); } EXPORT_SYMBOL(arch_register_cpu); @@ -56,12 +57,13 @@ void arch_unregister_cpu(int num) unregister_cpu(&per_cpu(cpu_devices, num).cpu); } EXPORT_SYMBOL(arch_unregister_cpu); -#else +#else /* CONFIG_HOTPLUG_CPU */ + static int __init arch_register_cpu(int num) { return register_cpu(&per_cpu(cpu_devices, num).cpu, num); } -#endif /*CONFIG_HOTPLUG_CPU*/ +#endif /* CONFIG_HOTPLUG_CPU */ static int __init topology_init(void) { @@ -70,11 +72,11 @@ static int __init topology_init(void) #ifdef CONFIG_NUMA for_each_online_node(i) register_one_node(i); -#endif /* CONFIG_NUMA */ +#endif for_each_present_cpu(i) arch_register_cpu(i); + return 0; } - subsys_initcall(topology_init); -- cgit v1.2.3 From 390cd85c8ae4dc54ffba460a0e6575889170f56e Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 21 Mar 2009 16:55:45 +0530 Subject: x86: kdebugfs.c cleanup Impact: cleanup Signed-off-by: Jaswinder Singh Rajput --- arch/x86/kernel/kdebugfs.c | 82 ++++++++++++++++++++-------------------------- 1 file changed, 36 insertions(+), 46 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index ff7d3b0124f..e444357375c 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -8,11 +8,11 @@ */ #include #include -#include +#include #include +#include #include #include -#include #include @@ -26,9 +26,8 @@ struct setup_data_node { u32 len; }; -static ssize_t -setup_data_read(struct file *file, char __user *user_buf, size_t count, - loff_t *ppos) +static ssize_t setup_data_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) { struct setup_data_node *node = file->private_data; unsigned long remain; @@ -39,20 +38,21 @@ setup_data_read(struct file *file, char __user *user_buf, size_t count, if (pos < 0) return -EINVAL; + if (pos >= node->len) return 0; if (count > node->len - pos) count = node->len - pos; + pa = node->paddr + sizeof(struct setup_data) + pos; pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT); if (PageHighMem(pg)) { p = ioremap_cache(pa, count); if (!p) return -ENXIO; - } else { + } else p = __va(pa); - } remain = copy_to_user(user_buf, p, count); @@ -70,12 +70,13 @@ setup_data_read(struct file *file, char __user *user_buf, size_t count, static int setup_data_open(struct inode *inode, struct file *file) { file->private_data = inode->i_private; + return 0; } static const struct file_operations fops_setup_data = { - .read = setup_data_read, - .open = setup_data_open, + .read = setup_data_read, + .open = setup_data_open, }; static int __init @@ -84,57 +85,50 @@ create_setup_data_node(struct dentry *parent, int no, { struct dentry *d, *type, *data; char buf[16]; - int error; sprintf(buf, "%d", no); d = debugfs_create_dir(buf, parent); - if (!d) { - error = -ENOMEM; - goto err_return; - } + if (!d) + return -ENOMEM; + type = debugfs_create_x32("type", S_IRUGO, d, &node->type); - if (!type) { - error = -ENOMEM; + if (!type) goto err_dir; - } + data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data); - if (!data) { - error = -ENOMEM; + if (!data) goto err_type; - } + return 0; err_type: debugfs_remove(type); err_dir: debugfs_remove(d); -err_return: - return error; + return -ENOMEM; } static int __init create_setup_data_nodes(struct dentry *parent) { struct setup_data_node *node; struct setup_data *data; - int error, no = 0; + int error = -ENOMEM; struct dentry *d; struct page *pg; u64 pa_data; + int no = 0; d = debugfs_create_dir("setup_data", parent); - if (!d) { - error = -ENOMEM; - goto err_return; - } + if (!d) + return -ENOMEM; pa_data = boot_params.hdr.setup_data; while (pa_data) { node = kmalloc(sizeof(*node), GFP_KERNEL); - if (!node) { - error = -ENOMEM; + if (!node) goto err_dir; - } + pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); if (PageHighMem(pg)) { data = ioremap_cache(pa_data, sizeof(*data)); @@ -143,9 +137,8 @@ static int __init create_setup_data_nodes(struct dentry *parent) error = -ENXIO; goto err_dir; } - } else { + } else data = __va(pa_data); - } node->paddr = pa_data; node->type = data->type; @@ -159,11 +152,11 @@ static int __init create_setup_data_nodes(struct dentry *parent) goto err_dir; no++; } + return 0; err_dir: debugfs_remove(d); -err_return: return error; } @@ -175,28 +168,26 @@ static struct debugfs_blob_wrapper boot_params_blob = { static int __init boot_params_kdebugfs_init(void) { struct dentry *dbp, *version, *data; - int error; + int error = -ENOMEM; dbp = debugfs_create_dir("boot_params", NULL); - if (!dbp) { - error = -ENOMEM; - goto err_return; - } + if (!dbp) + return -ENOMEM; + version = debugfs_create_x16("version", S_IRUGO, dbp, &boot_params.hdr.version); - if (!version) { - error = -ENOMEM; + if (!version) goto err_dir; - } + data = debugfs_create_blob("data", S_IRUGO, dbp, &boot_params_blob); - if (!data) { - error = -ENOMEM; + if (!data) goto err_version; - } + error = create_setup_data_nodes(dbp); if (error) goto err_data; + return 0; err_data: @@ -205,10 +196,9 @@ err_version: debugfs_remove(version); err_dir: debugfs_remove(dbp); -err_return: return error; } -#endif +#endif /* CONFIG_DEBUG_BOOT_PARAMS */ static int __init arch_kdebugfs_init(void) { -- cgit v1.2.3 From c8344bc218ce7ebc728337839698566a79edfe5a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 21 Mar 2009 16:56:10 +0530 Subject: x86: i8253 cleanup Impact: cleanup - fix various style problems - fix header file issues Signed-off-by: Jaswinder Singh Rajput --- arch/x86/kernel/i8253.c | 68 ++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 35 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 10f92fb532f..3475440baa5 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -3,17 +3,17 @@ * */ #include -#include #include +#include #include #include -#include +#include +#include +#include -#include -#include #include -#include #include +#include DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); @@ -40,7 +40,7 @@ static void init_pit_timer(enum clock_event_mode mode, { spin_lock(&i8253_lock); - switch(mode) { + switch (mode) { case CLOCK_EVT_MODE_PERIODIC: /* binary, mode 2, LSB/MSB, ch 0 */ outb_pit(0x34, PIT_MODE); @@ -95,7 +95,7 @@ static int pit_next_event(unsigned long delta, struct clock_event_device *evt) * registered. This mechanism replaces the previous #ifdef LOCAL_APIC - * !using_apic_timer decisions in do_timer_interrupt_hook() */ -static struct clock_event_device pit_clockevent = { +static struct clock_event_device pit_ce = { .name = "pit", .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .set_mode = init_pit_timer, @@ -114,15 +114,13 @@ void __init setup_pit_timer(void) * Start pit with the boot cpu mask and make it global after the * IO_APIC has been initialized. */ - pit_clockevent.cpumask = cpumask_of(smp_processor_id()); - pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, - pit_clockevent.shift); - pit_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFF, &pit_clockevent); - pit_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &pit_clockevent); - clockevents_register_device(&pit_clockevent); - global_clock_event = &pit_clockevent; + pit_ce.cpumask = cpumask_of(smp_processor_id()); + pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift); + pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce); + pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce); + + clockevents_register_device(&pit_ce); + global_clock_event = &pit_ce; } #ifndef CONFIG_X86_64 @@ -133,11 +131,11 @@ void __init setup_pit_timer(void) */ static cycle_t pit_read(void) { + static int old_count; + static u32 old_jifs; unsigned long flags; int count; u32 jifs; - static int old_count; - static u32 old_jifs; spin_lock_irqsave(&i8253_lock, flags); /* @@ -179,9 +177,9 @@ static cycle_t pit_read(void) * Previous attempts to handle these cases intelligently were * buggy, so we just do the simple thing now. */ - if (count > old_count && jifs == old_jifs) { + if (count > old_count && jifs == old_jifs) count = old_count; - } + old_count = count; old_jifs = jifs; @@ -192,13 +190,13 @@ static cycle_t pit_read(void) return (cycle_t)(jifs * LATCH) + count; } -static struct clocksource clocksource_pit = { - .name = "pit", - .rating = 110, - .read = pit_read, - .mask = CLOCKSOURCE_MASK(32), - .mult = 0, - .shift = 20, +static struct clocksource pit_cs = { + .name = "pit", + .rating = 110, + .read = pit_read, + .mask = CLOCKSOURCE_MASK(32), + .mult = 0, + .shift = 20, }; static void pit_disable_clocksource(void) @@ -206,9 +204,9 @@ static void pit_disable_clocksource(void) /* * Use mult to check whether it is registered or not */ - if (clocksource_pit.mult) { - clocksource_unregister(&clocksource_pit); - clocksource_pit.mult = 0; + if (pit_cs.mult) { + clocksource_unregister(&pit_cs); + pit_cs.mult = 0; } } @@ -222,13 +220,13 @@ static int __init init_pit_clocksource(void) * - when local APIC timer is active (PIT is switched off) */ if (num_possible_cpus() > 1 || is_hpet_enabled() || - pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC) + pit_ce.mode != CLOCK_EVT_MODE_PERIODIC) return 0; - clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, - clocksource_pit.shift); - return clocksource_register(&clocksource_pit); + pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift); + + return clocksource_register(&pit_cs); } arch_initcall(init_pit_clocksource); -#endif +#endif /* !CONFIG_X86_64 */ -- cgit v1.2.3 From 8383d821e7481f7cde9c17b61deb1b4af43b2cd9 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 21 Mar 2009 16:56:37 +0530 Subject: x86: rtc.c cleanup Impact: cleanup - fix various style problems - fix header file issues Signed-off-by: Jaswinder Singh Rajput --- arch/x86/kernel/rtc.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index dd6f2b71561..5d465b207e7 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -1,14 +1,14 @@ /* * RTC related functions */ +#include +#include #include #include -#include -#include #include -#include #include +#include #ifdef CONFIG_X86_32 /* @@ -16,9 +16,9 @@ * register we are working with. It is required for NMI access to the * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. */ -volatile unsigned long cmos_lock = 0; +volatile unsigned long cmos_lock; EXPORT_SYMBOL(cmos_lock); -#endif +#endif /* CONFIG_X86_32 */ /* For two digit years assume time is always after that */ #define CMOS_YEARS_OFFS 2000 @@ -38,9 +38,9 @@ EXPORT_SYMBOL(rtc_lock); */ int mach_set_rtc_mmss(unsigned long nowtime) { - int retval = 0; int real_seconds, real_minutes, cmos_minutes; unsigned char save_control, save_freq_select; + int retval = 0; /* tell the clock it's being set */ save_control = CMOS_READ(RTC_CONTROL); @@ -72,8 +72,8 @@ int mach_set_rtc_mmss(unsigned long nowtime) real_seconds = bin2bcd(real_seconds); real_minutes = bin2bcd(real_minutes); } - CMOS_WRITE(real_seconds,RTC_SECONDS); - CMOS_WRITE(real_minutes,RTC_MINUTES); + CMOS_WRITE(real_seconds, RTC_SECONDS); + CMOS_WRITE(real_minutes, RTC_MINUTES); } else { printk(KERN_WARNING "set_rtc_mmss: can't update from %d to %d\n", @@ -151,6 +151,7 @@ unsigned char rtc_cmos_read(unsigned char addr) outb(addr, RTC_PORT(0)); val = inb(RTC_PORT(1)); lock_cmos_suffix(addr); + return val; } EXPORT_SYMBOL(rtc_cmos_read); @@ -166,8 +167,8 @@ EXPORT_SYMBOL(rtc_cmos_write); static int set_rtc_mmss(unsigned long nowtime) { - int retval; unsigned long flags; + int retval; spin_lock_irqsave(&rtc_lock, flags); retval = set_wallclock(nowtime); @@ -242,6 +243,7 @@ static __init int add_rtc_cmos(void) platform_device_register(&rtc_device); dev_info(&rtc_device.dev, "registered platform RTC device (no PNP device found)\n"); + return 0; } device_initcall(add_rtc_cmos); -- cgit v1.2.3 From d53a44446076a7dd68a4fb7f549fb6aeda9bfc2c Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 21 Mar 2009 16:57:04 +0530 Subject: x86: io_delay.c cleanup Impact: cleanup - fix header file issues Signed-off-by: Jaswinder Singh Rajput --- arch/x86/kernel/io_delay.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index 720d2607aac..a979b5bd2fc 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -7,10 +7,10 @@ */ #include #include -#include #include +#include #include -#include +#include int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE; @@ -47,8 +47,7 @@ EXPORT_SYMBOL(native_io_delay); static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id) { if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) { - printk(KERN_NOTICE "%s: using 0xed I/O delay port\n", - id->ident); + pr_notice("%s: using 0xed I/O delay port\n", id->ident); io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; } @@ -64,40 +63,40 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = { .callback = dmi_io_delay_0xed_port, .ident = "Compaq Presario V6000", .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), - DMI_MATCH(DMI_BOARD_NAME, "30B7") + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B7") } }, { .callback = dmi_io_delay_0xed_port, .ident = "HP Pavilion dv9000z", .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), - DMI_MATCH(DMI_BOARD_NAME, "30B9") + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B9") } }, { .callback = dmi_io_delay_0xed_port, .ident = "HP Pavilion dv6000", .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), - DMI_MATCH(DMI_BOARD_NAME, "30B8") + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B8") } }, { .callback = dmi_io_delay_0xed_port, .ident = "HP Pavilion tx1000", .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), - DMI_MATCH(DMI_BOARD_NAME, "30BF") + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30BF") } }, { .callback = dmi_io_delay_0xed_port, .ident = "Presario F700", .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), - DMI_MATCH(DMI_BOARD_NAME, "30D3") + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30D3") } }, { } -- cgit v1.2.3 From 1894e36754d682cc049b2b1c3825da8e585967d5 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 21 Mar 2009 17:01:25 +0530 Subject: x86: pci-nommu.c cleanup Impact: cleanup Signed-off-by: Jaswinder Singh Rajput --- arch/x86/kernel/pci-nommu.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index c70ab5a5d4c..8b02a3936d4 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -1,14 +1,14 @@ /* Fallback functions when the main IOMMU code is not compiled in. This code is roughly equivalent to i386. */ -#include -#include -#include -#include #include #include +#include +#include +#include +#include -#include #include +#include #include static int @@ -79,11 +79,11 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, } struct dma_mapping_ops nommu_dma_ops = { - .alloc_coherent = dma_generic_alloc_coherent, - .free_coherent = nommu_free_coherent, - .map_single = nommu_map_single, - .map_sg = nommu_map_sg, - .is_phys = 1, + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = nommu_free_coherent, + .map_single = nommu_map_single, + .map_sg = nommu_map_sg, + .is_phys = 1, }; void __init no_iommu_init(void) -- cgit v1.2.3 From 1cc185211a9cb913f6adbe3354e5c256f456ebd2 Mon Sep 17 00:00:00 2001 From: Dmitri Vorobiev Date: Sun, 22 Mar 2009 19:11:09 +0200 Subject: x86: Fix a couple of sparse warnings in arch/x86/kernel/apic/io_apic.c Impact: cleanup This patch fixes the following sparse warnings: arch/x86/kernel/apic/io_apic.c:3602:17: warning: symbol 'hpet_msi_type' was not declared. Should it be static? arch/x86/kernel/apic/io_apic.c:3467:30: warning: Using plain integer as NULL pointer Signed-off-by: Dmitri Vorobiev LKML-Reference: <1237741871-5827-2-git-send-email-dmitri.vorobiev@movial.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 42cdc78427a..ea97e5efa90 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3464,7 +3464,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int ret, sub_handle; struct msi_desc *msidesc; unsigned int irq_want; - struct intel_iommu *iommu = 0; + struct intel_iommu *iommu = NULL; int index = 0; irq_want = nr_irqs_gsi; @@ -3599,7 +3599,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) #endif /* CONFIG_SMP */ -struct irq_chip hpet_msi_type = { +static struct irq_chip hpet_msi_type = { .name = "HPET_MSI", .unmask = hpet_msi_unmask, .mask = hpet_msi_mask, -- cgit v1.2.3 From f2362e6f1b9c5c168e5b4159afb4853ba467965e Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Mon, 23 Mar 2009 02:06:51 +0530 Subject: x86: cpu/cpu.h cleanup Impact: cleanup - Fix various style issues Signed-off-by: Jaswinder Singh Rajput --- arch/x86/kernel/cpu/cpu.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 9469ecb5aeb..6de9a908e40 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -3,25 +3,25 @@ #define ARCH_X86_CPU_H struct cpu_model_info { - int vendor; - int family; - const char *model_names[16]; + int vendor; + int family; + const char *model_names[16]; }; /* attempt to consolidate cpu attributes */ struct cpu_dev { - const char * c_vendor; + const char *c_vendor; /* some have two possibilities for cpuid string */ - const char * c_ident[2]; + const char *c_ident[2]; struct cpu_model_info c_models[4]; - void (*c_early_init)(struct cpuinfo_x86 *c); - void (*c_init)(struct cpuinfo_x86 * c); - void (*c_identify)(struct cpuinfo_x86 * c); - unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size); - int c_x86_vendor; + void (*c_early_init)(struct cpuinfo_x86 *); + void (*c_init)(struct cpuinfo_x86 *); + void (*c_identify)(struct cpuinfo_x86 *); + unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int); + int c_x86_vendor; }; #define cpu_dev_register(cpu_devX) \ -- cgit v1.2.3 From ce2d8bfd44c01fc9b22d64617b7e520e99095f33 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Mon, 23 Mar 2009 02:08:00 +0530 Subject: x86: irq.c use same path for show_interrupts Impact: cleanup SMP and !SMP will use same path for show_interrupts Signed-off-by: Jaswinder Singh Rajput --- arch/x86/kernel/irq.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index b8ac3b6cf77..5e7c3e6f8f2 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -133,23 +133,15 @@ int show_interrupts(struct seq_file *p, void *v) return 0; spin_lock_irqsave(&desc->lock, flags); -#ifndef CONFIG_SMP - any_count = kstat_irqs(i); -#else for_each_online_cpu(j) any_count |= kstat_irqs_cpu(i, j); -#endif action = desc->action; if (!action && !any_count) goto out; seq_printf(p, "%*d: ", prec, i); -#ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); -#else for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); -#endif seq_printf(p, " %8s", desc->chip->name); seq_printf(p, "-%-8s", desc->name); -- cgit v1.2.3 From 474e56b82c06cdbed468aea957805e0eb19d3510 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Mon, 23 Mar 2009 02:08:34 +0530 Subject: x86: irq.c keep CONFIG_X86_LOCAL_APIC interrupts together Impact: cleanup keep CONFIG_X86_LOCAL_APIC interrupts together to avoid extra ifdef Signed-off-by: Jaswinder Singh Rajput --- arch/x86/kernel/irq.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 5e7c3e6f8f2..3aaf7b9e3a8 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -58,6 +58,11 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); seq_printf(p, " Local timer interrupts\n"); + + seq_printf(p, "%*s: ", prec, "SPU"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); + seq_printf(p, " Spurious interrupts\n"); #endif if (generic_interrupt_extension) { seq_printf(p, "PLT: "); @@ -90,12 +95,6 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); # endif -#endif -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "%*s: ", prec, "SPU"); - for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); - seq_printf(p, " Spurious interrupts\n"); #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) @@ -166,6 +165,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; + sum += irq_stats(cpu)->irq_spurious_count; #endif if (generic_interrupt_extension) sum += irq_stats(cpu)->generic_irqs; @@ -179,9 +179,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu) # ifdef CONFIG_X86_64 sum += irq_stats(cpu)->irq_threshold_count; #endif -#endif -#ifdef CONFIG_X86_LOCAL_APIC - sum += irq_stats(cpu)->irq_spurious_count; #endif return sum; } -- cgit v1.2.3 From a1e38ca5ce1789de1bbd723e1e09de962e47ce18 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Mon, 23 Mar 2009 02:11:25 +0530 Subject: x86: apic/io_apic.c define msi_ir_chip and ir_ioapic_chip all the time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit move out msi_ir_chip and ir_ioapic_chip from CONFIG_INTR_REMAP shadow Fix: arch/x86/kernel/apic/io_apic.c:1431: warning: ‘msi_ir_chip’ defined but not used Signed-off-by: Jaswinder Singh Rajput --- arch/x86/kernel/apic/io_apic.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 42cdc78427a..d36e3d8be0f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1428,7 +1428,6 @@ void __setup_vector_irq(int cpu) static struct irq_chip ioapic_chip; static struct irq_chip ir_ioapic_chip; -static struct irq_chip msi_ir_chip; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 @@ -2663,20 +2662,20 @@ static struct irq_chip ioapic_chip __read_mostly = { .retrigger = ioapic_retrigger_irq, }; -#ifdef CONFIG_INTR_REMAP static struct irq_chip ir_ioapic_chip __read_mostly = { .name = "IR-IO-APIC", .startup = startup_ioapic_irq, .mask = mask_IO_APIC_irq, .unmask = unmask_IO_APIC_irq, +#ifdef CONFIG_INTR_REMAP .ack = ack_x2apic_edge, .eoi = ack_x2apic_level, #ifdef CONFIG_SMP .set_affinity = set_ir_ioapic_affinity_irq, +#endif #endif .retrigger = ioapic_retrigger_irq, }; -#endif static inline void init_IO_APIC_traps(void) { @@ -3391,18 +3390,18 @@ static struct irq_chip msi_chip = { .retrigger = ioapic_retrigger_irq, }; -#ifdef CONFIG_INTR_REMAP static struct irq_chip msi_ir_chip = { .name = "IR-PCI-MSI", .unmask = unmask_msi_irq, .mask = mask_msi_irq, +#ifdef CONFIG_INTR_REMAP .ack = ack_x2apic_edge, #ifdef CONFIG_SMP .set_affinity = ir_set_msi_irq_affinity, +#endif #endif .retrigger = ioapic_retrigger_irq, }; -#endif /* * Map the PCI dev to the corresponding remapping hardware unit -- cgit v1.2.3 From ba639039d68cd978f4fa900a6533fe930609ed35 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Mon, 23 Mar 2009 02:13:01 +0530 Subject: x86: e820 fix various signedness issues in setup.c and e820.c Impact: cleanup This fixed various signedness issues in setup.c and e820.c: arch/x86/kernel/setup.c:455:53: warning: incorrect type in argument 3 (different signedness) arch/x86/kernel/setup.c:455:53: expected int *pnr_map arch/x86/kernel/setup.c:455:53: got unsigned int extern [toplevel] * arch/x86/kernel/setup.c:639:53: warning: incorrect type in argument 3 (different signedness) arch/x86/kernel/setup.c:639:53: expected int *pnr_map arch/x86/kernel/setup.c:639:53: got unsigned int extern [toplevel] * arch/x86/kernel/setup.c:820:54: warning: incorrect type in argument 3 (different signedness) arch/x86/kernel/setup.c:820:54: expected int *pnr_map arch/x86/kernel/setup.c:820:54: got unsigned int extern [toplevel] * arch/x86/kernel/e820.c:670:53: warning: incorrect type in argument 3 (different signedness) arch/x86/kernel/e820.c:670:53: expected int *pnr_map arch/x86/kernel/e820.c:670:53: got unsigned int [toplevel] * Signed-off-by: Jaswinder Singh Rajput --- arch/x86/kernel/e820.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index fb638d9ce6d..ef2c3563357 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -233,7 +233,7 @@ void __init e820_print_map(char *who) */ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, - int *pnr_map) + u32 *pnr_map) { struct change_member { struct e820entry *pbios; /* pointer to original bios entry */ @@ -552,7 +552,7 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, void __init update_e820(void) { - int nr_map; + u32 nr_map; nr_map = e820.nr_map; if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) @@ -563,7 +563,7 @@ void __init update_e820(void) } static void __init update_e820_saved(void) { - int nr_map; + u32 nr_map; nr_map = e820_saved.nr_map; if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map)) @@ -1303,7 +1303,7 @@ early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) { if (userdef) { - int nr = e820.nr_map; + u32 nr = e820.nr_map; if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0) early_panic("Invalid user supplied memory map"); @@ -1386,7 +1386,7 @@ void __init e820_reserve_resources_late(void) char *__init default_machine_specific_memory_setup(void) { char *who = "BIOS-e820"; - int new_nr; + u32 new_nr; /* * Try to copy the BIOS-supplied E820-map. * -- cgit v1.2.3 From fa74c9073370e57fa28e02aff13f4d7b1806505c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 24 Mar 2009 13:23:16 -0700 Subject: x86: fix set_extra_move_desc calling Impact: fix bug with irq-descriptor moving when logical flat Rusty observed: > The effect of setting desc->affinity (ie. from userspace via sysfs) has varied > over time. In 2.6.27, the 32-bit code anded the value with cpu_online_map, > and both 32 and 64-bit did that anding whenever a cpu was unplugged. > > 2.6.29 consolidated this into one routine (and fixed hotplug) but introduced > another variation: anding the affinity with cfg->domain. Is this right, or > should we just set it to what the user said? Or as now, indicate that we're > restricting it. Eric pointed out that desc->affinity should be what the user requested, if it is at all possible to honor the user space request. This bug got introduced by commit 22f65d31b "x86: Update io_apic.c to use new cpumask API". Fix it by moving the masking to before the descriptor moving ... Reported-by: Rusty Russell Reported-by: Eric W. Biederman LKML-Reference: <49C94134.4000408@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 86827d85488..1ed6c0600cd 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -592,8 +592,9 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) if (assign_irq_vector(irq, cfg, mask)) return BAD_APICID; - cpumask_and(desc->affinity, cfg->domain, mask); + /* check that before desc->addinity get updated */ set_extra_move_desc(desc, mask); + cpumask_and(desc->affinity, cfg->domain, mask); return apic->cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask); } -- cgit v1.2.3 From f56e5034121c4911a155ba907076ab920754626d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 24 Mar 2009 14:16:30 -0700 Subject: x86: use default_cpu_mask_to_apicid for 64bit Impact: cleanup Use online_mask directly on 64bit too. Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49C94DAE.9070300@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_flat_64.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index f933822dba1..0014714ea97 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -159,20 +159,6 @@ static int flat_apic_id_registered(void) return physid_isset(read_xapic_id(), phys_cpu_present_map); } -static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS; -} - -static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS; - unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS; - - return mask1 & mask2; -} - static int flat_phys_pkg_id(int initial_apic_id, int index_msb) { return hard_smp_processor_id() >> index_msb; @@ -213,8 +199,8 @@ struct apic apic_flat = { .set_apic_id = set_apic_id, .apic_id_mask = 0xFFu << 24, - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = flat_send_IPI_mask, .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, -- cgit v1.2.3 From e06b1b56f9bfcc91e1f175fe8d8bf3e35dafa080 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 24 Mar 2009 14:17:19 -0700 Subject: x86: Correct behaviour of irq affinity Impact: get correct smp_affinity as user requested The effect of setting desc->affinity (ie. from userspace via sysfs) has varied over time. In 2.6.27, the 32-bit code anded the value with cpu_online_map, and both 32 and 64-bit did that anding whenever a cpu was unplugged. 2.6.29 consolidated this into one routine (and fixed hotplug) but introduced another variation: anding the affinity with cfg->domain. We should just set it to what the user said - if possible. (cpu_mask_to_apicid_and already takes cpu_online_mask into account) Signed-off-by: Yinghai Lu Acked-by: "Eric W. Biederman" Cc: Andrew Morton LKML-Reference: <49C94DDF.2010703@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1ed6c0600cd..d990408ca06 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -594,9 +594,10 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) /* check that before desc->addinity get updated */ set_extra_move_desc(desc, mask); - cpumask_and(desc->affinity, cfg->domain, mask); - return apic->cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask); + cpumask_copy(desc->affinity, mask); + + return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); } static void -- cgit v1.2.3 From 70511134f61bd6e5eed19f767381f9fb3e762d49 Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Mon, 23 Mar 2009 23:14:29 -0700 Subject: Revert "x86: don't compile vsmp_64 for 32bit" Partial revert of commit 129d8bc828e011bda0b7110a097bf3a0167f966e titled 'x86: don't compile vsmp_64 for 32bit' Commit reverted to compile vsmp_64.c if CONFIG_X86_64 is defined, since is_vsmp_box() needs to indicate that TSCs are not synchronized, and hence, not a valid time source, even when CONFIG_X86_VSMP is not defined. Signed-off-by: Ravikiran Thirumalai Cc: Yinghai Lu Cc: Andrew Morton Cc: shai@scalex86.org LKML-Reference: <20090324061429.GH7278@localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/vsmp_64.c | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 339ce35648e..6e9c1f320ac 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -70,7 +70,6 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o -obj-$(CONFIG_X86_VSMP) += vsmp_64.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_MODULES) += module_$(BITS).o obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o @@ -120,4 +119,5 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o + obj-y += vsmp_64.o endif diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 74de562812c..a1d804bcd48 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -22,7 +22,7 @@ #include #include -#ifdef CONFIG_PARAVIRT +#if defined CONFIG_PCI && defined CONFIG_PARAVIRT /* * Interrupt control on vSMPowered systems: * ~AC is a shadow of IF. If IF is 'on' AC should be 'off' @@ -114,6 +114,7 @@ static void __init set_vsmp_pv_ops(void) } #endif +#ifdef CONFIG_PCI static int is_vsmp = -1; static void __init detect_vsmp_box(void) @@ -139,6 +140,15 @@ int is_vsmp_box(void) } } +#else +static void __init detect_vsmp_box(void) +{ +} +int is_vsmp_box(void) +{ + return 0; +} +#endif void __init vsmp_init(void) { detect_vsmp_box(); -- cgit v1.2.3