From 973a2dd1d50a11d380086601f14e59116f93e8c5 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:39:32 +0100
Subject: x86, mce: disable machine checks on suspend

Impact: Bug fix

During suspend it is not reliable to process machine check
exceptions, because CPUs disappear but can still get machine check
broadcasts.  Also the system is slightly more likely to
machine check them, but the handler is typically not a position
to handle them in a meaningfull way.

So disable them during suspend and enable them during resume.

Also make sure they are always disabled on hot-unplugged CPUs.

This new code assumes that suspend always hotunplugs all
non BP CPUs.

v2: Remove the WARN_ONs Thomas objected to.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 25cf624eccb..5ed80991ab9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -728,6 +728,29 @@ __setup("mce=", mcheck_enable);
  * Sysfs support
  */
 
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ */
+static int mce_disable(void)
+{
+	int i;
+
+	for (i = 0; i < banks; i++)
+		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+	return 0;
+}
+
+static int mce_suspend(struct sys_device *dev, pm_message_t state)
+{
+	return mce_disable();
+}
+
+static int mce_shutdown(struct sys_device *dev)
+{
+	return mce_disable();
+}
+
 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
    Only one CPU is active at this time, the others get readded later using
    CPU hotplug. */
@@ -752,6 +775,8 @@ static void mce_restart(void)
 }
 
 static struct sysdev_class mce_sysclass = {
+	.suspend = mce_suspend,
+	.shutdown = mce_shutdown,
 	.resume = mce_resume,
 	.name = "machinecheck",
 };
-- 
cgit v1.2.3


From 123aa76ec0cab5d4881cd8509faed43231e68801 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:39:27 +0100
Subject: x86, mce: don't disable machine checks during code patching

Impact: low priority bug fix

This removes part of a a patch I added myself some time ago. After some
consideration the patch was a bad idea. In particular it stopped machine check
exceptions during code patching.

To quote the comment:

        * MCEs only happen when something got corrupted and in this
        * case we must do something about the corruption.
        * Ignoring it is worse than a unlikely patching race.
        * Also machine checks tend to be broadcast and if one CPU
        * goes into machine check the others follow quickly, so we don't
        * expect a machine check to cause undue problems during to code
        * patching.

So undo the machine check related parts of
8f4e956b313dcccbc7be6f10808952345e3b638c NMIs are still disabled.

This only removes code, the only additions are a new comment.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/alternative.c       | 17 +++++++++++------
 arch/x86/kernel/cpu/mcheck/mce_32.c | 14 --------------
 arch/x86/kernel/cpu/mcheck/mce_64.c | 14 --------------
 3 files changed, 11 insertions(+), 34 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a84ac7b570e..5b8394a3a6b 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -414,9 +414,17 @@ void __init alternative_instructions(void)
 	   that might execute the to be patched code.
 	   Other CPUs are not running. */
 	stop_nmi();
-#ifdef CONFIG_X86_MCE
-	stop_mce();
-#endif
+
+	/*
+	 * Don't stop machine check exceptions while patching.
+	 * MCEs only happen when something got corrupted and in this
+	 * case we must do something about the corruption.
+	 * Ignoring it is worse than a unlikely patching race.
+	 * Also machine checks tend to be broadcast and if one CPU
+	 * goes into machine check the others follow quickly, so we don't
+	 * expect a machine check to cause undue problems during to code
+	 * patching.
+	 */
 
 	apply_alternatives(__alt_instructions, __alt_instructions_end);
 
@@ -456,9 +464,6 @@ void __init alternative_instructions(void)
 				(unsigned long)__smp_locks_end);
 
 	restart_nmi();
-#ifdef CONFIG_X86_MCE
-	restart_mce();
-#endif
 }
 
 /**
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index dfaebce3633..3552119b091 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
 	}
 }
 
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-	old_cr4 = read_cr4();
-	clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-	if (old_cr4 & X86_CR4_MCE)
-		set_in_cr4(X86_CR4_MCE);
-}
-
 static int __init mcheck_disable(char *str)
 {
 	mce_disabled = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 5ed80991ab9..25ccdbec86e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -680,20 +680,6 @@ static struct miscdevice mce_log_device = {
 	&mce_chrdev_ops,
 };
 
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-	old_cr4 = read_cr4();
-	clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-	if (old_cr4 & X86_CR4_MCE)
-		set_in_cr4(X86_CR4_MCE);
-}
-
 /*
  * Old style boot options parsing. Only for compatibility.
  */
-- 
cgit v1.2.3


From 9bd984058088d6ef7af6946591a207e51a2f4890 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:39:28 +0100
Subject: x86, mce: always use separate work queue to run trigger

Impact: Needed for bug fix in next patch

This relaxes the requirement that mce_notify_user has to run in process
context. Useful for future changes, but also leads to cleaner
behaviour now. Now instead mce_notify_user can be called directly
from interrupt (but not NMI) context.

The work queue only uses a single global work struct, which can be done safely
because it is always free to reuse before the trigger function is executed.
This way no events can be lost.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 25ccdbec86e..18b379cf061 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -380,11 +380,17 @@ static void mcheck_timer(struct work_struct *work)
 	schedule_delayed_work(&mcheck_work, next_interval);
 }
 
+static void mce_do_trigger(struct work_struct *work)
+{
+	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
+}
+
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
 /*
- * This is only called from process context.  This is where we do
- * anything we need to alert userspace about new MCEs.  This is called
- * directly from the poller and also from entry.S and idle, thanks to
- * TIF_MCE_NOTIFY.
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
  */
 int mce_notify_user(void)
 {
@@ -394,9 +400,14 @@ int mce_notify_user(void)
 		unsigned long now = jiffies;
 
 		wake_up_interruptible(&mce_wait);
-		if (trigger[0])
-			call_usermodehelper(trigger, trigger_argv, NULL,
-						UMH_NO_WAIT);
+
+		/*
+		 * There is no risk of missing notifications because
+		 * work_pending is always cleared before the function is
+		 * executed.
+		 */
+		if (trigger[0] && !work_pending(&mce_trigger_work))
+			schedule_work(&mce_trigger_work);
 
 		if (time_after_eq(now, last_print + (check_interval*HZ))) {
 			last_print = now;
-- 
cgit v1.2.3


From 52d168e28bc11dd026b620fe1767cadde5a747cd Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:39:29 +0100
Subject: x86, mce: switch machine check polling to per CPU timer

Impact: Higher priority bug fix

The machine check poller runs a single timer and then broadcasted an
IPI to all CPUs to check them. This leads to unnecessary
synchronization between CPUs. The original CPU running the timer has
to wait potentially a long time for all other CPUs answering. This is
also real time unfriendly and in general inefficient.

This was especially a problem on systems with a lot of events where
the poller run with a higher frequency after processing some events.
There could be more and more CPU time wasted with this, to
the point of significantly slowing down machines.

The machine check polling is actually fully independent per CPU, so
there's no reason to not just do this all with per CPU timers.  This
patch implements that.

Also switch the poller also to use standard timers instead of work
queues. It was using work queues to be able to execute a user program
on a event, but mce_notify_user() handles this case now with a
separate callback. So instead always run the poll code in in a
standard per CPU timer, which means that in the common case of not
having to execute a trigger there will be less overhead.

This allows to clean up the initialization significantly, because
standard timers are already up when machine checks get init'ed.  No
multiple initialization functions.

Thanks to Thomas Gleixner for some help.

Cc: thockin@google.com
v2: Use del_timer_sync() on cpu shutdown and don't try to handle
migrated timers.
v3: Add WARN_ON for timer running on unexpected CPU

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 68 ++++++++++++++++++++++++-------------
 1 file changed, 45 insertions(+), 23 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 18b379cf061..3f0550d16f3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -353,18 +353,17 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
 
 static int check_interval = 5 * 60; /* 5 minutes */
 static int next_interval; /* in jiffies */
-static void mcheck_timer(struct work_struct *work);
-static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
+static void mcheck_timer(unsigned long);
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
-static void mcheck_check_cpu(void *info)
+static void mcheck_timer(unsigned long data)
 {
+	struct timer_list *t = &per_cpu(mce_timer, data);
+
+	WARN_ON(smp_processor_id() != data);
+
 	if (mce_available(&current_cpu_data))
 		do_machine_check(NULL, 0);
-}
-
-static void mcheck_timer(struct work_struct *work)
-{
-	on_each_cpu(mcheck_check_cpu, NULL, 1);
 
 	/*
 	 * Alert userspace if needed.  If we logged an MCE, reduce the
@@ -377,7 +376,8 @@ static void mcheck_timer(struct work_struct *work)
 				(int)round_jiffies_relative(check_interval*HZ));
 	}
 
-	schedule_delayed_work(&mcheck_work, next_interval);
+	t->expires = jiffies + next_interval;
+	add_timer(t);
 }
 
 static void mce_do_trigger(struct work_struct *work)
@@ -436,16 +436,11 @@ static struct notifier_block mce_idle_notifier = {
 
 static __init int periodic_mcheck_init(void)
 {
-	next_interval = check_interval * HZ;
-	if (next_interval)
-		schedule_delayed_work(&mcheck_work,
-				      round_jiffies_relative(next_interval));
-	idle_notifier_register(&mce_idle_notifier);
-	return 0;
+       idle_notifier_register(&mce_idle_notifier);
+       return 0;
 }
 __initcall(periodic_mcheck_init);
 
-
 /*
  * Initialize Machine Checks for a CPU.
  */
@@ -515,6 +510,20 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
 	}
 }
 
+static void mce_init_timer(void)
+{
+	struct timer_list *t = &__get_cpu_var(mce_timer);
+
+	/* data race harmless because everyone sets to the same value */
+	if (!next_interval)
+		next_interval = check_interval * HZ;
+	if (!next_interval)
+		return;
+	setup_timer(t, mcheck_timer, smp_processor_id());
+	t->expires = round_jiffies_relative(jiffies + next_interval);
+	add_timer(t);
+}
+
 /*
  * Called for each booted CPU to set up machine checks.
  * Must be called with preempt off.
@@ -529,6 +538,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 
 	mce_init(NULL);
 	mce_cpu_features(c);
+	mce_init_timer();
 }
 
 /*
@@ -758,17 +768,19 @@ static int mce_resume(struct sys_device *dev)
 	return 0;
 }
 
+static void mce_cpu_restart(void *data)
+{
+	del_timer_sync(&__get_cpu_var(mce_timer));
+	if (mce_available(&current_cpu_data))
+		mce_init(NULL);
+	mce_init_timer();
+}
+
 /* Reinit MCEs after user configuration changes */
 static void mce_restart(void)
 {
-	if (next_interval)
-		cancel_delayed_work(&mcheck_work);
-	/* Timer race is harmless here */
-	on_each_cpu(mce_init, NULL, 1);
 	next_interval = check_interval * HZ;
-	if (next_interval)
-		schedule_delayed_work(&mcheck_work,
-				      round_jiffies_relative(next_interval));
+	on_each_cpu(mce_cpu_restart, NULL, 1);
 }
 
 static struct sysdev_class mce_sysclass = {
@@ -899,6 +911,7 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
+	struct timer_list *t = &per_cpu(mce_timer, cpu);
 
 	switch (action) {
 	case CPU_ONLINE:
@@ -913,6 +926,15 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 			threshold_cpu_callback(action, cpu);
 		mce_remove_device(cpu);
 		break;
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		del_timer_sync(t);
+		break;
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+		t->expires = round_jiffies_relative(jiffies + next_interval);
+		add_timer_on(t, cpu);
+		break;
 	}
 	return NOTIFY_OK;
 }
-- 
cgit v1.2.3


From 5b4408fdaa62474dd9485cddb9126370d90d4b82 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:39:30 +0100
Subject: x86, mce: don't set up mce sysdev devices with mce=off

Impact: bug fix, in this case the resume handler shouldn't run which
	avoids incorrectly reenabling machine checks on resume

When MCEs are completely disabled on the command line don't set
up the sysdev devices for them either.

Includes a comment fix from Thomas Gleixner.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 3f0550d16f3..4e2b1bc5131 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -151,6 +151,8 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 
 static int mce_available(struct cpuinfo_x86 *c)
 {
+	if (mce_dont_init)
+		return 0;
 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
 
@@ -532,8 +534,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 {
 	mce_cpu_quirks(c);
 
-	if (mce_dont_init ||
-	    !mce_available(c))
+	if (!mce_available(c))
 		return;
 
 	mce_init(NULL);
@@ -710,8 +711,7 @@ static int __init mcheck_disable(char *str)
 	return 1;
 }
 
-/* mce=off disables machine check. Note you can re-enable it later
-   using sysfs.
+/* mce=off disables machine check.
    mce=TOLERANCELEVEL (number, see above)
    mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
    mce=nobootlog Don't log MCEs from before booting. */
-- 
cgit v1.2.3


From d6b75584a3eaab8cb2ab3e8cf90c5e57c1928a85 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:39:31 +0100
Subject: x86, mce: disable machine checks on offlined CPUs

Impact: Lower priority bug fix

Offlined CPUs could still get machine checks, but the machine check handler
cannot handle them properly, leading to an unconditional crash. Disable
machine checks on CPUs that are going down.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 4e2b1bc5131..1db94c0d5aa 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -906,6 +906,27 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 	cpu_clear(cpu, mce_device_initialized);
 }
 
+/* Make sure there are no machine checks on offlined CPUs. */
+static void __cpuexit mce_disable_cpu(void *h)
+{
+	int i;
+
+	if (!mce_available(&current_cpu_data))
+		return;
+	for (i = 0; i < banks; i++)
+		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+}
+
+static void __cpuexit mce_reenable_cpu(void *h)
+{
+	int i;
+
+	if (!mce_available(&current_cpu_data))
+		return;
+	for (i = 0; i < banks; i++)
+		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
+}
+
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
 static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
@@ -929,11 +950,13 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		del_timer_sync(t);
+		smp_call_function_single(cpu, mce_disable_cpu, NULL, 1);
 		break;
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
 		t->expires = round_jiffies_relative(jiffies + next_interval);
 		add_timer_on(t, cpu);
+		smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1);
 		break;
 	}
 	return NOTIFY_OK;
-- 
cgit v1.2.3


From ef41df4344ff952c79746d44a6126bd2cf7ed2bc Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Thu, 12 Feb 2009 13:39:34 +0100
Subject: x86, mce: fix a race condition in mce_read()

Impact: bugfix

Considering the situation as follow:

before: mcelog.next == 1, mcelog.entry[0].finished = 1

+--------------------------------------------------------------------------
R                   W1                  W2                  W3

read mcelog.next (1)
                    mcelog.next++ (2)
                    (working on entry 1,
                    finished == 0)

mcelog.next = 0
                                        mcelog.next++ (1)
                                        (working on entry 0)
                                                           mcelog.next++ (2)
                                                           (working on entry 1)
                        <----------------- race ---------------->
                    (done on entry 1,
                    finished = 1)
                                                           (done on entry 1,
                                                           finished = 1)

To fix the race condition, a cmpxchg loop is added to mce_read() to
ensure no new MCE record can be added between mcelog.next reading and
mcelog.next = 0.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 41 ++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 1db94c0d5aa..870d08deccf 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -595,7 +595,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 {
 	unsigned long *cpu_tsc;
 	static DEFINE_MUTEX(mce_read_mutex);
-	unsigned next;
+	unsigned prev, next;
 	char __user *buf = ubuf;
 	int i, err;
 
@@ -614,25 +614,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 	}
 
 	err = 0;
-	for (i = 0; i < next; i++) {
-		unsigned long start = jiffies;
-
-		while (!mcelog.entry[i].finished) {
-			if (time_after_eq(jiffies, start + 2)) {
-				memset(mcelog.entry + i,0, sizeof(struct mce));
-				goto timeout;
+	prev = 0;
+	do {
+		for (i = prev; i < next; i++) {
+			unsigned long start = jiffies;
+
+			while (!mcelog.entry[i].finished) {
+				if (time_after_eq(jiffies, start + 2)) {
+					memset(mcelog.entry + i, 0,
+					       sizeof(struct mce));
+					goto timeout;
+				}
+				cpu_relax();
 			}
-			cpu_relax();
+			smp_rmb();
+			err |= copy_to_user(buf, mcelog.entry + i,
+					    sizeof(struct mce));
+			buf += sizeof(struct mce);
+timeout:
+			;
 		}
-		smp_rmb();
-		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
-		buf += sizeof(struct mce);
- timeout:
-		;
-	}
 
-	memset(mcelog.entry, 0, next * sizeof(struct mce));
-	mcelog.next = 0;
+		memset(mcelog.entry + prev, 0,
+		       (next - prev) * sizeof(struct mce));
+		prev = next;
+		next = cmpxchg(&mcelog.next, prev, 0);
+	} while (next != prev);
 
 	synchronize_sched();
 
-- 
cgit v1.2.3


From 0d7482e3d76522157c9d741d79fce22c401fa0c5 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Tue, 17 Feb 2009 23:07:13 +0100
Subject: x86, mce: implement dynamic machine check banks support

Impact: cleanup; making code future proof; memory saving on small systems

This patch replaces the hardcoded max number of machine check banks with
dynamic allocation depending on what the CPU reports. The sysfs
data structures and the banks array are dynamically allocated.

There is still a hard bank limit (128) because the mcelog protocol uses
banks >= 128 as pseudo banks to escape other events. But we expect
that 128 banks is beyond any reasonable CPU for now.

This supersedes an earlier patch by Venki, but it solves the problem
more completely by making the limit fully dynamic (up to the 128
boundary).

This saves some memory on machines with less than 6 banks because
they won't need sysdevs for unused ones and also allows to
use sysfs to control these banks on possible future CPUs with
more than 6 banks.

This is an updated patch addressing Venki's comments.  I also added in
another patch from Thomas which fixed the error allocation path (that
patch was previously separated)

Cc: Venki Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 147 ++++++++++++++++++++++++++++--------
 1 file changed, 115 insertions(+), 32 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 870d08deccf..2297730bb51 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -24,6 +24,8 @@
 #include <linux/ctype.h>
 #include <linux/kmod.h>
 #include <linux/kdebug.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
@@ -32,7 +34,12 @@
 #include <asm/idle.h>
 
 #define MISC_MCELOG_MINOR 227
-#define NR_SYSFS_BANKS 6
+
+/*
+ * To support more than 128 would need to escape the predefined
+ * Linux defined extended banks first.
+ */
+#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
 
 atomic_t mce_entry;
 
@@ -47,7 +54,7 @@ static int mce_dont_init;
  */
 static int tolerant = 1;
 static int banks;
-static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
+static u64 *bank;
 static unsigned long notify_user;
 static int rip_msr;
 static int mce_bootlog = -1;
@@ -212,7 +219,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	barrier();
 
 	for (i = 0; i < banks; i++) {
-		if (i < NR_SYSFS_BANKS && !bank[i])
+		if (!bank[i])
 			continue;
 
 		m.misc = 0;
@@ -446,37 +453,54 @@ __initcall(periodic_mcheck_init);
 /*
  * Initialize Machine Checks for a CPU.
  */
-static void mce_init(void *dummy)
+static int mce_cap_init(void)
 {
 	u64 cap;
-	int i;
+	unsigned b;
 
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
-	banks = cap & 0xff;
-	if (banks > MCE_EXTENDED_BANK) {
-		banks = MCE_EXTENDED_BANK;
-		printk(KERN_INFO "MCE: warning: using only %d banks\n",
-		       MCE_EXTENDED_BANK);
+	b = cap & 0xff;
+	if (b > MAX_NR_BANKS) {
+		printk(KERN_WARNING
+		       "MCE: Using only %u machine check banks out of %u\n",
+			MAX_NR_BANKS, b);
+		b = MAX_NR_BANKS;
+	}
+
+	/* Don't support asymmetric configurations today */
+	WARN_ON(banks != 0 && b != banks);
+	banks = b;
+	if (!bank) {
+		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
+		if (!bank)
+			return -ENOMEM;
+		memset(bank, 0xff, banks * sizeof(u64));
 	}
+
 	/* Use accurate RIP reporting if available. */
 	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
 		rip_msr = MSR_IA32_MCG_EIP;
 
+	return 0;
+}
+
+static void mce_init(void *dummy)
+{
+	u64 cap;
+	int i;
+
 	/* Log the machine checks left over from the previous reset.
 	   This also clears all registers */
 	do_machine_check(NULL, mce_bootlog ? -1 : -2);
 
 	set_in_cr4(X86_CR4_MCE);
 
+	rdmsrl(MSR_IA32_MCG_CAP, cap);
 	if (cap & MCG_CTL_P)
 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 
 	for (i = 0; i < banks; i++) {
-		if (i < NR_SYSFS_BANKS)
-			wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
-		else
-			wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
-
+		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 	}
 }
@@ -486,10 +510,10 @@ static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
 {
 	/* This should be disabled by the BIOS, but isn't always */
 	if (c->x86_vendor == X86_VENDOR_AMD) {
-		if(c->x86 == 15)
+		if (c->x86 == 15 && banks > 4)
 			/* disable GART TBL walk error reporting, which trips off
 			   incorrectly with the IOMMU & 3ware & Cerberus. */
-			clear_bit(10, &bank[4]);
+			clear_bit(10, (unsigned long *)&bank[4]);
 		if(c->x86 <= 17 && mce_bootlog < 0)
 			/* Lots of broken BIOS around that don't clear them
 			   by default and leave crap in there. Don't log. */
@@ -532,11 +556,15 @@ static void mce_init_timer(void)
  */
 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 {
-	mce_cpu_quirks(c);
-
 	if (!mce_available(c))
 		return;
 
+	if (mce_cap_init() < 0) {
+		mce_dont_init = 1;
+		return;
+	}
+	mce_cpu_quirks(c);
+
 	mce_init(NULL);
 	mce_cpu_features(c);
 	mce_init_timer();
@@ -819,16 +847,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
 	}								\
 	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 
-/*
- * TBD should generate these dynamically based on number of available banks.
- * Have only 6 contol banks in /sysfs until then.
- */
-ACCESSOR(bank0ctl,bank[0],mce_restart())
-ACCESSOR(bank1ctl,bank[1],mce_restart())
-ACCESSOR(bank2ctl,bank[2],mce_restart())
-ACCESSOR(bank3ctl,bank[3],mce_restart())
-ACCESSOR(bank4ctl,bank[4],mce_restart())
-ACCESSOR(bank5ctl,bank[5],mce_restart())
+static struct sysdev_attribute *bank_attrs;
+
+static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+			 char *buf)
+{
+	u64 b = bank[attr - bank_attrs];
+	return sprintf(buf, "%Lx\n", b);
+}
+
+static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+			const char *buf, size_t siz)
+{
+	char *end;
+	u64 new = simple_strtoull(buf, &end, 0);
+	if (end == buf)
+		return -EINVAL;
+	bank[attr - bank_attrs] = new;
+	mce_restart();
+	return end-buf;
+}
 
 static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
 				char *buf)
@@ -855,8 +893,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
 ACCESSOR(check_interval,check_interval,mce_restart())
 static struct sysdev_attribute *mce_attributes[] = {
-	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
-	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
 	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
 	NULL
 };
@@ -886,11 +922,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
 		if (err)
 			goto error;
 	}
+	for (i = 0; i < banks; i++) {
+		err = sysdev_create_file(&per_cpu(device_mce, cpu),
+					&bank_attrs[i]);
+		if (err)
+			goto error2;
+	}
 	cpu_set(cpu, mce_device_initialized);
 
 	return 0;
+error2:
+	while (--i >= 0) {
+		sysdev_remove_file(&per_cpu(device_mce, cpu),
+					&bank_attrs[i]);
+	}
 error:
-	while (i--) {
+	while (--i >= 0) {
 		sysdev_remove_file(&per_cpu(device_mce,cpu),
 				   mce_attributes[i]);
 	}
@@ -909,6 +956,9 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 	for (i = 0; mce_attributes[i]; i++)
 		sysdev_remove_file(&per_cpu(device_mce,cpu),
 			mce_attributes[i]);
+	for (i = 0; i < banks; i++)
+		sysdev_remove_file(&per_cpu(device_mce, cpu),
+			&bank_attrs[i]);
 	sysdev_unregister(&per_cpu(device_mce,cpu));
 	cpu_clear(cpu, mce_device_initialized);
 }
@@ -973,6 +1023,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
 	.notifier_call = mce_cpu_callback,
 };
 
+static __init int mce_init_banks(void)
+{
+	int i;
+
+	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
+				GFP_KERNEL);
+	if (!bank_attrs)
+		return -ENOMEM;
+
+	for (i = 0; i < banks; i++) {
+		struct sysdev_attribute *a = &bank_attrs[i];
+		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
+		if (!a->attr.name)
+			goto nomem;
+		a->attr.mode = 0644;
+		a->show = show_bank;
+		a->store = set_bank;
+	}
+	return 0;
+
+nomem:
+	while (--i >= 0)
+		kfree(bank_attrs[i].attr.name);
+	kfree(bank_attrs);
+	bank_attrs = NULL;
+	return -ENOMEM;
+}
+
 static __init int mce_init_device(void)
 {
 	int err;
@@ -980,6 +1058,11 @@ static __init int mce_init_device(void)
 
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
+
+	err = mce_init_banks();
+	if (err)
+		return err;
+
 	err = sysdev_class_register(&mce_sysclass);
 	if (err)
 		return err;
-- 
cgit v1.2.3


From b5f2fa4ea00a179ac1c2ff342ceeee261dd75e53 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:43:22 +0100
Subject: x86, mce: factor out duplicated struct mce setup into one function

Impact: cleanup

This merely factors out duplicated code to set up
the initial struct mce state into a single function.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c       | 23 ++++++++++++++---------
 arch/x86/kernel/cpu/mcheck/mce_amd_64.c   |  4 +---
 arch/x86/kernel/cpu/mcheck/mce_intel_64.c |  2 +-
 3 files changed, 16 insertions(+), 13 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 2297730bb51..fed875742b1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -65,6 +65,14 @@ static char *trigger_argv[2] = { trigger, NULL };
 
 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
 
+/* Do initial initialization of a struct mce */
+void mce_setup(struct mce *m)
+{
+	memset(m, 0, sizeof(struct mce));
+	m->cpu = smp_processor_id();
+	rdtscll(m->tsc);
+}
+
 /*
  * Lockless MCE logging infrastructure.
  * This avoids deadlocks on printk locks without having to break locks. Also
@@ -208,8 +216,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	    || !banks)
 		goto out2;
 
-	memset(&m, 0, sizeof(struct mce));
-	m.cpu = smp_processor_id();
+	mce_setup(&m);
+
 	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
 	/* if the restart IP is not valid, we're done for */
 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -225,7 +233,6 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 		m.misc = 0;
 		m.addr = 0;
 		m.bank = i;
-		m.tsc = 0;
 
 		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
 		if ((m.status & MCI_STATUS_VAL) == 0)
@@ -252,8 +259,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 
 		mce_get_rip(&m, regs);
-		if (error_code >= 0)
-			rdtscll(m.tsc);
+		if (error_code < 0)
+			m.tsc = 0;
 		if (error_code != -2)
 			mce_log(&m);
 
@@ -341,15 +348,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
  * and historically has been the register value of the
  * MSR_IA32_THERMAL_STATUS (Intel) msr.
  */
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
+void mce_log_therm_throt_event(__u64 status)
 {
 	struct mce m;
 
-	memset(&m, 0, sizeof(m));
-	m.cpu = cpu;
+	mce_setup(&m);
 	m.bank = MCE_THERMAL_BANK;
 	m.status = status;
-	rdtscll(m.tsc);
 	mce_log(&m);
 }
 #endif /* CONFIG_X86_MCE_INTEL */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 8ae8c4ff094..75d9dd25e3d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -197,9 +197,7 @@ asmlinkage void mce_threshold_interrupt(void)
 	exit_idle();
 	irq_enter();
 
-	memset(&m, 0, sizeof(m));
-	rdtscll(m.tsc);
-	m.cpu = smp_processor_id();
+	mce_setup(&m);
 
 	/* assume first bank caused it */
 	for (bank = 0; bank < NR_BANKS; ++bank) {
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 4b48f251fd3..7f7f1015ef1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -24,7 +24,7 @@ asmlinkage void smp_thermal_interrupt(void)
 
 	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
 	if (therm_throt_process(msr_val & 1))
-		mce_log_therm_throt_event(smp_processor_id(), msr_val);
+		mce_log_therm_throt_event(msr_val);
 
 	inc_irq_stat(irq_thermal_count);
 	irq_exit();
-- 
cgit v1.2.3


From b79109c3bbcf52cac5103979b283b9e5df4e796c Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:43:23 +0100
Subject: x86, mce: separate correct machine check poller and fatal exception
 handler

Impact: cleanup, performance enhancement

The machine check poller is diverging more and more from the fatal
exception handler. Instead of adding more special cases separate the code
paths completely. The corrected poll path is actually quite simple,
and this doesn't result in much code duplication.

This makes both handlers much easier to read and results in
cleaner code flow.  The exception handler now only needs to care
about uncorrected errors, which also simplifies the handling of multiple
errors. The corrected poller also now always runs in standard interrupt
context and does not need to do anything special to handle NMI context.

Minor behaviour changes:
- MCG status is now not cleared on polling.
- Only the banks which had corrected errors get cleared on polling
- The exception handler only clears banks with errors now

v2: Forward port to new patch order. Add "uc" argument.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c     | 129 ++++++++++++++++++++++++++------
 arch/x86/kernel/cpu/mcheck/mce_amd_64.c |   2 +-
 2 files changed, 109 insertions(+), 22 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index fed875742b1..268b05edade 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
  * Rest from unknown author(s).
  * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
  */
 
 #include <linux/init.h>
@@ -189,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 }
 
 /*
- * The actual machine check handler
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ */
+void machine_check_poll(enum mcp_flags flags)
+{
+	struct mce m;
+	int i;
+
+	mce_setup(&m);
+
+	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+	for (i = 0; i < banks; i++) {
+		if (!bank[i])
+			continue;
+
+		m.misc = 0;
+		m.addr = 0;
+		m.bank = i;
+		m.tsc = 0;
+
+		barrier();
+		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
+		if (!(m.status & MCI_STATUS_VAL))
+			continue;
+
+		/*
+		 * Uncorrected events are handled by the exception handler
+		 * when it is enabled. But when the exception is disabled log
+		 * everything.
+		 *
+		 * TBD do the same check for MCI_STATUS_EN here?
+		 */
+		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
+			continue;
+
+		if (m.status & MCI_STATUS_MISCV)
+			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
+		if (m.status & MCI_STATUS_ADDRV)
+			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
+
+		if (!(flags & MCP_TIMESTAMP))
+			m.tsc = 0;
+		/*
+		 * Don't get the IP here because it's unlikely to
+		 * have anything to do with the actual error location.
+		 */
+
+		mce_log(&m);
+		add_taint(TAINT_MACHINE_CHECK);
+
+		/*
+		 * Clear state for this bank.
+		 */
+		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	}
+
+	/*
+	 * Don't clear MCG_STATUS here because it's only defined for
+	 * exceptions.
+	 */
+}
+
+/*
+ * The actual machine check handler. This only handles real
+ * exceptions when something got corrupted coming in through int 18.
+ *
+ * This is executed in NMI context not subject to normal locking rules. This
+ * implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
  */
 void do_machine_check(struct pt_regs * regs, long error_code)
 {
@@ -207,13 +279,14 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	 * error.
 	 */
 	int kill_it = 0;
+	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 
 	atomic_inc(&mce_entry);
 
-	if ((regs
-	     && notify_die(DIE_NMI, "machine check", regs, error_code,
+	if (notify_die(DIE_NMI, "machine check", regs, error_code,
 			   18, SIGKILL) == NOTIFY_STOP)
-	    || !banks)
+		goto out2;
+	if (!banks)
 		goto out2;
 
 	mce_setup(&m);
@@ -227,6 +300,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	barrier();
 
 	for (i = 0; i < banks; i++) {
+		__clear_bit(i, toclear);
 		if (!bank[i])
 			continue;
 
@@ -238,6 +312,20 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 		if ((m.status & MCI_STATUS_VAL) == 0)
 			continue;
 
+		/*
+		 * Non uncorrected errors are handled by machine_check_poll
+		 * Leave them alone.
+		 */
+		if ((m.status & MCI_STATUS_UC) == 0)
+			continue;
+
+		/*
+		 * Set taint even when machine check was not enabled.
+		 */
+		add_taint(TAINT_MACHINE_CHECK);
+
+		__set_bit(i, toclear);
+
 		if (m.status & MCI_STATUS_EN) {
 			/* if PCC was set, there's no way out */
 			no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -251,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 					no_way_out = 1;
 				kill_it = 1;
 			}
+		} else {
+			/*
+			 * Machine check event was not enabled. Clear, but
+			 * ignore.
+			 */
+			continue;
 		}
 
 		if (m.status & MCI_STATUS_MISCV)
@@ -259,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 
 		mce_get_rip(&m, regs);
-		if (error_code < 0)
-			m.tsc = 0;
-		if (error_code != -2)
-			mce_log(&m);
+		mce_log(&m);
 
 		/* Did this bank cause the exception? */
 		/* Assume that the bank with uncorrectable errors did it,
@@ -271,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 			panicm = m;
 			panicm_found = 1;
 		}
-
-		add_taint(TAINT_MACHINE_CHECK);
 	}
 
-	/* Never do anything final in the polling timer */
-	if (!regs)
-		goto out;
-
 	/* If we didn't find an uncorrectable error, pick
 	   the last one (shouldn't happen, just being safe). */
 	if (!panicm_found)
@@ -325,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	/* notify userspace ASAP */
 	set_thread_flag(TIF_MCE_NOTIFY);
 
- out:
 	/* the last thing we do is clear state */
-	for (i = 0; i < banks; i++)
-		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	for (i = 0; i < banks; i++) {
+		if (test_bit(i, toclear))
+			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	}
 	wrmsrl(MSR_IA32_MCG_STATUS, 0);
  out2:
 	atomic_dec(&mce_entry);
@@ -377,7 +463,7 @@ static void mcheck_timer(unsigned long data)
 	WARN_ON(smp_processor_id() != data);
 
 	if (mce_available(&current_cpu_data))
-		do_machine_check(NULL, 0);
+		machine_check_poll(MCP_TIMESTAMP);
 
 	/*
 	 * Alert userspace if needed.  If we logged an MCE, reduce the
@@ -494,9 +580,10 @@ static void mce_init(void *dummy)
 	u64 cap;
 	int i;
 
-	/* Log the machine checks left over from the previous reset.
-	   This also clears all registers */
-	do_machine_check(NULL, mce_bootlog ? -1 : -2);
+	/*
+	 * Log the machine checks left over from the previous reset.
+	 */
+	machine_check_poll(MCP_UC);
 
 	set_in_cr4(X86_CR4_MCE);
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 75d9dd25e3d..0069c653f4e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -231,7 +231,7 @@ asmlinkage void mce_threshold_interrupt(void)
 
 			/* Log the machine check that caused the threshold
 			   event. */
-			do_machine_check(NULL, 0);
+			machine_check_poll(MCP_TIMESTAMP);
 
 			if (high & MASK_OVERFLOW_HI) {
 				rdmsrl(address, m.misc);
-- 
cgit v1.2.3


From f6d1826dfad0d15fd14a455facc80b91f2ee642f Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Thu, 19 Feb 2009 15:44:58 -0800
Subject: x86, mce: use %ll instead of %L for 64-bit numbers

Impact: Cleanup

The standard spelling of a printf pattern for long long is "ll", not
"L", which is for long double.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 268b05edade..60a114ca14f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -136,11 +136,11 @@ static void print_mce(struct mce *m)
 			print_symbol("{%s}", m->ip);
 		printk("\n");
 	}
-	printk(KERN_EMERG "TSC %Lx ", m->tsc);
+	printk(KERN_EMERG "TSC %llx ", m->tsc);
 	if (m->addr)
-		printk("ADDR %Lx ", m->addr);
+		printk("ADDR %llx ", m->addr);
 	if (m->misc)
-		printk("MISC %Lx ", m->misc);
+		printk("MISC %llx ", m->misc);
 	printk("\n");
 	printk(KERN_EMERG "This is not a software problem!\n");
 	printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -945,7 +945,7 @@ static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
 			 char *buf)
 {
 	u64 b = bank[attr - bank_attrs];
-	return sprintf(buf, "%Lx\n", b);
+	return sprintf(buf, "%llx\n", b);
 }
 
 static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
-- 
cgit v1.2.3


From ec5b3d32437571b8a742069a4cfd04edb6b6eda5 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Mon, 23 Feb 2009 14:01:04 -0800
Subject: x86, mce: remove invalid __cpuinit/__cpuexit annotations

Impact: Bug fix when CPU hotplug is disabled

Correct the following broken __cpuinit/__cpuexit annotations:

- mce_cpu_features() is called from mce_resume(), and so cannot be
  __cpuinit.
- mce_disable_cpu() and mce_reenable_cpu() are called from
  mce_cpu_callback(), and so cannot be __cpuexit().

Cc: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 60a114ca14f..0625993bf95 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -598,7 +598,7 @@ static void mce_init(void *dummy)
 }
 
 /* Add per CPU specific workarounds here */
-static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
+static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 {
 	/* This should be disabled by the BIOS, but isn't always */
 	if (c->x86_vendor == X86_VENDOR_AMD) {
@@ -1056,7 +1056,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
-static void __cpuexit mce_disable_cpu(void *h)
+static void mce_disable_cpu(void *h)
 {
 	int i;
 
@@ -1066,7 +1066,7 @@ static void __cpuexit mce_disable_cpu(void *h)
 		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
 }
 
-static void __cpuexit mce_reenable_cpu(void *h)
+static void mce_reenable_cpu(void *h)
 {
 	int i;
 
-- 
cgit v1.2.3


From 41fdff322e26c4a86fe65cf577f2556a650cb7bc Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:49:30 +0100
Subject: x86, mce, cmci: export MAX_NR_BANKS

Impact: Cleanup (code movement)

Move MAX_NR_BANKS into mce.h because it's needed there
for followup patches.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index a4a7c686ce9..39f8bb525a7 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -37,12 +37,6 @@
 
 #define MISC_MCELOG_MINOR 227
 
-/*
- * To support more than 128 would need to escape the predefined
- * Linux defined extended banks first.
- */
-#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
-
 atomic_t mce_entry;
 
 static int mce_dont_init;
-- 
cgit v1.2.3


From b276268631af3a1b0df871e10d19d492f0513d4b Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:49:31 +0100
Subject: x86, mce, cmci: factor out threshold interrupt handler

Impact: cleanup; preparation for feature

The mce_amd_64 code has an own private MC threshold vector with an own
interrupt handler. Since Intel needs a similar handler
it makes sense to share the vector because both can not
be active at the same time.

I factored the common APIC handler code into a separate file which can
be used by both the Intel or AMD MC code.

This is needed for the next patch which adds an Intel specific
CMCI handler.

This patch should be a nop for AMD, it just moves some code
around.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/Makefile     |  1 +
 arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 15 ++++++---------
 arch/x86/kernel/cpu/mcheck/threshold.c  | 24 ++++++++++++++++++++++++
 3 files changed, 31 insertions(+), 9 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/mcheck/threshold.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index d7d2323bbb6..b2f89829bbe 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32)		+= k7.o p4.o p5.o p6.o winchip.o
 obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel_64.o
 obj-$(CONFIG_X86_MCE_AMD)	+= mce_amd_64.o
 obj-$(CONFIG_X86_MCE_NONFATAL)	+= non-fatal.o
+obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index e82c8208b81..49705be9820 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
 
 static DEFINE_PER_CPU(unsigned char, bank_map);	/* see which banks are on */
 
+static void amd_threshold_interrupt(void);
+
 /*
  * CPU Initialization
  */
@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 			tr.reset = 0;
 			tr.old_limit = 0;
 			threshold_restart_bank(&tr);
+
+			mce_threshold_vector = amd_threshold_interrupt;
 		}
 	}
 }
@@ -187,16 +191,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
  * the interrupt goes off when error_count reaches threshold_limit.
  * the handler will simply log mcelog w/ software defined bank number.
  */
-asmlinkage void mce_threshold_interrupt(void)
+static void amd_threshold_interrupt(void)
 {
 	unsigned int bank, block;
 	struct mce m;
 	u32 low = 0, high = 0, address = 0;
 
-	ack_APIC_irq();
-	exit_idle();
-	irq_enter();
-
 	mce_setup(&m);
 
 	/* assume first bank caused it */
@@ -241,13 +241,10 @@ asmlinkage void mce_threshold_interrupt(void)
 				       + bank * NR_BLOCKS
 				       + block;
 				mce_log(&m);
-				goto out;
+				return;
 			}
 		}
 	}
-out:
-	inc_irq_stat(irq_threshold_count);
-	irq_exit();
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
new file mode 100644
index 00000000000..4319142413d
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -0,0 +1,24 @@
+/* Common corrected MCE threshold handler code */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <asm/mce.h>
+#include <asm/irq_vectors.h>
+#include <asm/idle.h>
+
+static void default_threshold_interrupt(void)
+{
+	printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
+			 THRESHOLD_APIC_VECTOR);
+}
+
+void (*mce_threshold_vector)(void) = default_threshold_interrupt;
+
+asmlinkage void mce_threshold_interrupt(void)
+{
+	ack_APIC_irq();
+	exit_idle();
+	irq_enter();
+	inc_irq_stat(irq_threshold_count);
+	mce_threshold_vector();
+	irq_exit();
+}
-- 
cgit v1.2.3


From f9695df42cdbca78530b4458c38ecfdd0bb90079 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:49:32 +0100
Subject: x86, mce, cmci: avoid potential reentry of threshold interrupt

Impact: minor bugfix

The threshold handler on AMD (and soon on Intel) could be theoretically
reentered by the hardware. This could lead to corrupted events
because the machine check poll code assumes it is not reentered.

Move the APIC ACK to the end of the interrupt handler to let
the hardware avoid that.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/threshold.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index 4319142413d..e4b8a3833fc 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -15,10 +15,11 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt;
 
 asmlinkage void mce_threshold_interrupt(void)
 {
-	ack_APIC_irq();
 	exit_idle();
 	irq_enter();
 	inc_irq_stat(irq_threshold_count);
 	mce_threshold_vector();
 	irq_exit();
+	/* Ack only at the end to avoid potential reentry */
+	ack_APIC_irq();
 }
-- 
cgit v1.2.3


From 8457c84d68678cbfd4167a9073b89da58e48c037 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:49:33 +0100
Subject: x86, mce: replace machine check events logged interval with ratelimit

Impact: behavior change, use common code

Use a standard leaky bucket ratelimit for the machine check
warning print interval instead of waiting every check_interval.
Also decrease the limit to twice per minute.
This interacts better with threshold interrupts because
they can happen more often than check_interval.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 39f8bb525a7..9017609cadd 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -28,6 +28,7 @@
 #include <linux/kdebug.h>
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
+#include <linux/ratelimit.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
@@ -488,11 +489,11 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
  */
 int mce_notify_user(void)
 {
+	/* Not more than two messages every minute */
+	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
 	clear_thread_flag(TIF_MCE_NOTIFY);
 	if (test_and_clear_bit(0, &notify_user)) {
-		static unsigned long last_print;
-		unsigned long now = jiffies;
-
 		wake_up_interruptible(&mce_wait);
 
 		/*
@@ -503,10 +504,8 @@ int mce_notify_user(void)
 		if (trigger[0] && !work_pending(&mce_trigger_work))
 			schedule_work(&mce_trigger_work);
 
-		if (time_after_eq(now, last_print + (check_interval*HZ))) {
-			last_print = now;
+		if (__ratelimit(&ratelimit))
 			printk(KERN_INFO "Machine check events logged\n");
-		}
 
 		return 1;
 	}
-- 
cgit v1.2.3


From ee031c31d6381d004bfd386c2e45821211507499 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:49:34 +0100
Subject: x86, mce, cmci: use polled banks bitmap in machine check poller

Define a per cpu bitmap that contains the banks polled by the machine
check poller. This is needed for the CMCI code in the next patches
to be able to disable polling on specific banks.

The bank by default contains all banks, so there is no behaviour
change. Only future code will remove some banks from the polling
set.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c     | 16 ++++++++++++----
 arch/x86/kernel/cpu/mcheck/mce_amd_64.c |  3 ++-
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 9017609cadd..a8ff38bfa6e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -62,6 +62,11 @@ static char *trigger_argv[2] = { trigger, NULL };
 
 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
 
+/* MCA banks polled by the period polling timer for corrected events */
+DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
+	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
+};
+
 /* Do initial initialization of a struct mce */
 void mce_setup(struct mce *m)
 {
@@ -191,7 +196,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
  *
  * This is executed in standard interrupt context.
  */
-void machine_check_poll(enum mcp_flags flags)
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 {
 	struct mce m;
 	int i;
@@ -200,7 +205,7 @@ void machine_check_poll(enum mcp_flags flags)
 
 	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
 	for (i = 0; i < banks; i++) {
-		if (!bank[i])
+		if (!bank[i] || !test_bit(i, *b))
 			continue;
 
 		m.misc = 0;
@@ -458,7 +463,8 @@ static void mcheck_timer(unsigned long data)
 	WARN_ON(smp_processor_id() != data);
 
 	if (mce_available(&current_cpu_data))
-		machine_check_poll(MCP_TIMESTAMP);
+		machine_check_poll(MCP_TIMESTAMP,
+				&__get_cpu_var(mce_poll_banks));
 
 	/*
 	 * Alert userspace if needed.  If we logged an MCE, reduce the
@@ -572,11 +578,13 @@ static void mce_init(void *dummy)
 {
 	u64 cap;
 	int i;
+	mce_banks_t all_banks;
 
 	/*
 	 * Log the machine checks left over from the previous reset.
 	 */
-	machine_check_poll(MCP_UC);
+	bitmap_fill(all_banks, MAX_NR_BANKS);
+	machine_check_poll(MCP_UC, &all_banks);
 
 	set_in_cr4(X86_CR4_MCE);
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 49705be9820..ee8bfcd3aa3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -231,7 +231,8 @@ static void amd_threshold_interrupt(void)
 
 			/* Log the machine check that caused the threshold
 			   event. */
-			machine_check_poll(MCP_TIMESTAMP);
+			machine_check_poll(MCP_TIMESTAMP,
+					&__get_cpu_var(mce_poll_banks));
 
 			if (high & MASK_OVERFLOW_HI) {
 				rdmsrl(address, m.misc);
-- 
cgit v1.2.3


From 88ccbedd9ca85d1aca6a6f99df48dce87b7c02d4 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:49:36 +0100
Subject: x86, mce, cmci: add CMCI support

Impact: Major new feature

Intel CMCI (Corrected Machine Check Interrupt) is a new
feature on Nehalem CPUs. It allows the CPU to trigger
interrupts on corrected events, which allows faster
reaction to them instead of with the traditional
polling timer.

Also use CMCI to discover shared banks. Machine check banks
can be shared by CPU threads or even cores. Using the CMCI enable
bit it is possible to detect the fact that another CPU already
saw a specific bank. Use this to assign shared banks only
to one CPU to avoid reporting duplicated events.

On CPU hot unplug bank sharing is re discovered. This is done
using a thread that cycles through all the CPUs.

To avoid races between the poller and CMCI we only poll
for banks that are not CMCI capable and only check CMCI
owned banks on a interrupt.

The shared banks ownership information is currently only used for
CMCI interrupts, not polled banks.

The sharing discovery code follows the algorithm recommended in the
IA32 SDM Vol3a 14.5.2.1

The CMCI interrupt handler just calls the machine check poller to
pick up the machine check event that caused the interrupt.

I decided not to implement a separate threshold event like
the AMD version has, because the threshold is always one currently
and adding another event didn't seem to add any value.

Some code inspired by Yunhong Jiang's Xen implementation,
which was in term inspired by a earlier CMCI implementation
by me.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c       |  16 ++-
 arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 205 ++++++++++++++++++++++++++++++
 2 files changed, 218 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index a8ff38bfa6e..bfbd5323a63 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -166,7 +166,7 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 	panic(msg);
 }
 
-static int mce_available(struct cpuinfo_x86 *c)
+int mce_available(struct cpuinfo_x86 *c)
 {
 	if (mce_dont_init)
 		return 0;
@@ -1060,9 +1060,12 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 static void mce_disable_cpu(void *h)
 {
 	int i;
+	unsigned long action = *(unsigned long *)h;
 
 	if (!mce_available(&current_cpu_data))
 		return;
+	if (!(action & CPU_TASKS_FROZEN))
+		cmci_clear();
 	for (i = 0; i < banks; i++)
 		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
 }
@@ -1070,9 +1073,12 @@ static void mce_disable_cpu(void *h)
 static void mce_reenable_cpu(void *h)
 {
 	int i;
+	unsigned long action = *(unsigned long *)h;
 
 	if (!mce_available(&current_cpu_data))
 		return;
+	if (!(action & CPU_TASKS_FROZEN))
+		cmci_reenable();
 	for (i = 0; i < banks; i++)
 		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
 }
@@ -1100,13 +1106,17 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		del_timer_sync(t);
-		smp_call_function_single(cpu, mce_disable_cpu, NULL, 1);
+		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
 		break;
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
 		t->expires = round_jiffies_relative(jiffies + next_interval);
 		add_timer_on(t, cpu);
-		smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1);
+		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+		break;
+	case CPU_POST_DEAD:
+		/* intentionally ignoring frozen here */
+		cmci_rediscover(cpu);
 		break;
 	}
 	return NOTIFY_OK;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 1b1491a76b5..a518ec8c6f8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -1,6 +1,8 @@
 /*
  * Intel specific MCE features.
  * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Author: Andi Kleen
  */
 
 #include <linux/init.h>
@@ -12,6 +14,7 @@
 #include <asm/hw_irq.h>
 #include <asm/idle.h>
 #include <asm/therm_throt.h>
+#include <asm/apic.h>
 
 asmlinkage void smp_thermal_interrupt(void)
 {
@@ -84,7 +87,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 	return;
 }
 
+/*
+ * Support for Intel Correct Machine Check Interrupts. This allows
+ * the CPU to raise an interrupt when a corrected machine check happened.
+ * Normally we pick those up using a regular polling timer.
+ * Also supports reliable discovery of shared banks.
+ */
+
+static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
+
+/*
+ * cmci_discover_lock protects against parallel discovery attempts
+ * which could race against each other.
+ */
+static DEFINE_SPINLOCK(cmci_discover_lock);
+
+#define CMCI_THRESHOLD 1
+
+static __cpuinit int cmci_supported(int *banks)
+{
+	u64 cap;
+
+	/*
+	 * Vendor check is not strictly needed, but the initial
+	 * initialization is vendor keyed and this
+	 * makes sure none of the backdoors are entered otherwise.
+	 */
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return 0;
+	if (!cpu_has_apic || lapic_get_maxlvt() < 6)
+		return 0;
+	rdmsrl(MSR_IA32_MCG_CAP, cap);
+	*banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
+	return !!(cap & MCG_CMCI_P);
+}
+
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+	mce_notify_user();
+}
+
+static void print_update(char *type, int *hdr, int num)
+{
+	if (*hdr == 0)
+		printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
+	*hdr = 1;
+	printk(KERN_CONT " %s:%d", type, num);
+}
+
+/*
+ * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
+ * on this CPU. Use the algorithm recommended in the SDM to discover shared
+ * banks.
+ */
+static __cpuinit void cmci_discover(int banks, int boot)
+{
+	unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
+	int hdr = 0;
+	int i;
+
+	spin_lock(&cmci_discover_lock);
+	for (i = 0; i < banks; i++) {
+		u64 val;
+
+		if (test_bit(i, owned))
+			continue;
+
+		rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+		/* Already owned by someone else? */
+		if (val & CMCI_EN) {
+			if (test_and_clear_bit(i, owned) || boot)
+				print_update("SHD", &hdr, i);
+			__clear_bit(i, __get_cpu_var(mce_poll_banks));
+			continue;
+		}
+
+		val |= CMCI_EN | CMCI_THRESHOLD;
+		wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+		rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+		/* Did the enable bit stick? -- the bank supports CMCI */
+		if (val & CMCI_EN) {
+			if (!test_and_set_bit(i, owned) || boot)
+				print_update("CMCI", &hdr, i);
+			__clear_bit(i, __get_cpu_var(mce_poll_banks));
+		} else {
+			WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
+		}
+	}
+	spin_unlock(&cmci_discover_lock);
+	if (hdr)
+		printk(KERN_CONT "\n");
+}
+
+/*
+ * Just in case we missed an event during initialization check
+ * all the CMCI owned banks.
+ */
+__cpuinit void cmci_recheck(void)
+{
+	unsigned long flags;
+	int banks;
+
+	if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
+		return;
+	local_irq_save(flags);
+	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+	local_irq_restore(flags);
+}
+
+/*
+ * Disable CMCI on this CPU for all banks it owns when it goes down.
+ * This allows other CPUs to claim the banks on rediscovery.
+ */
+void __cpuexit cmci_clear(void)
+{
+	int i;
+	int banks;
+	u64 val;
+
+	if (!cmci_supported(&banks))
+		return;
+	spin_lock(&cmci_discover_lock);
+	for (i = 0; i < banks; i++) {
+		if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
+			continue;
+		/* Disable CMCI */
+		rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+		val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
+		wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+		__clear_bit(i, __get_cpu_var(mce_banks_owned));
+	}
+	spin_unlock(&cmci_discover_lock);
+}
+
+/*
+ * After a CPU went down cycle through all the others and rediscover
+ * Must run in process context.
+ */
+void __cpuexit cmci_rediscover(int dying)
+{
+	int banks;
+	int cpu;
+	cpumask_var_t old;
+
+	if (!cmci_supported(&banks))
+		return;
+	if (!alloc_cpumask_var(&old, GFP_KERNEL))
+		return;
+	cpumask_copy(old, &current->cpus_allowed);
+
+	for_each_online_cpu (cpu) {
+		if (cpu == dying)
+			continue;
+		if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)))
+			continue;
+		/* Recheck banks in case CPUs don't all have the same */
+		if (cmci_supported(&banks))
+			cmci_discover(banks, 0);
+	}
+
+	set_cpus_allowed_ptr(current, old);
+	free_cpumask_var(old);
+}
+
+/*
+ * Reenable CMCI on this CPU in case a CPU down failed.
+ */
+void cmci_reenable(void)
+{
+	int banks;
+	if (cmci_supported(&banks))
+		cmci_discover(banks, 0);
+}
+
+static __cpuinit void intel_init_cmci(void)
+{
+	int banks;
+
+	if (!cmci_supported(&banks))
+		return;
+
+	mce_threshold_vector = intel_threshold_interrupt;
+	cmci_discover(banks, 1);
+	/*
+	 * For CPU #0 this runs with still disabled APIC, but that's
+	 * ok because only the vector is set up. We still do another
+	 * check for the banks later for CPU #0 just to make sure
+	 * to not miss any events.
+	 */
+	apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
+	cmci_recheck();
+}
+
 void mce_intel_feature_init(struct cpuinfo_x86 *c)
 {
 	intel_init_thermal(c);
+	intel_init_cmci();
 }
-- 
cgit v1.2.3


From df20e2eb3e59b8625021a1bc8b1b53a4edc6008b Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 24 Feb 2009 13:19:02 -0800
Subject: x86, mce, cmci: remove incorrect __cpuinit/__cpuexit annotations

Impact: Bug fix on UP

The MCE code is reinitialized from resume, so we can't use
__cpuinit/__cpuexit for most of the code.  Remove those annotations
for anything downstream of mce_init().

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index a518ec8c6f8..7a2e10fcfa3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -104,7 +104,7 @@ static DEFINE_SPINLOCK(cmci_discover_lock);
 
 #define CMCI_THRESHOLD 1
 
-static __cpuinit int cmci_supported(int *banks)
+static int cmci_supported(int *banks)
 {
 	u64 cap;
 
@@ -147,7 +147,7 @@ static void print_update(char *type, int *hdr, int num)
  * on this CPU. Use the algorithm recommended in the SDM to discover shared
  * banks.
  */
-static __cpuinit void cmci_discover(int banks, int boot)
+static void cmci_discover(int banks, int boot)
 {
 	unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
 	int hdr = 0;
@@ -192,7 +192,7 @@ static __cpuinit void cmci_discover(int banks, int boot)
  * Just in case we missed an event during initialization check
  * all the CMCI owned banks.
  */
-__cpuinit void cmci_recheck(void)
+void cmci_recheck(void)
 {
 	unsigned long flags;
 	int banks;
@@ -208,7 +208,7 @@ __cpuinit void cmci_recheck(void)
  * Disable CMCI on this CPU for all banks it owns when it goes down.
  * This allows other CPUs to claim the banks on rediscovery.
  */
-void __cpuexit cmci_clear(void)
+void cmci_clear(void)
 {
 	int i;
 	int banks;
@@ -233,7 +233,7 @@ void __cpuexit cmci_clear(void)
  * After a CPU went down cycle through all the others and rediscover
  * Must run in process context.
  */
-void __cpuexit cmci_rediscover(int dying)
+void cmci_rediscover(int dying)
 {
 	int banks;
 	int cpu;
-- 
cgit v1.2.3


From 5ca8681ca10f671427710f4954644359856581a3 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:49:37 +0100
Subject: x86, mce, cmci: disable CMCI on rebooting

Impact: Avoids confusing other OSes.

Disable the CMCI vector on reboot to avoid confusing other OS.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 570f36e44e5..648676f0b50 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -868,6 +868,14 @@ void clear_local_APIC(void)
 		apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
 	}
 #endif
+#ifdef CONFIG_X86_MCE_INTEL
+	if (maxlvt >= 6) {
+		v = apic_read(APIC_LVTCMCI);
+		if (!(v & APIC_LVT_MASKED))
+			apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
+	}
+#endif
+
 	/*
 	 * Clean APIC state for other OSs:
 	 */
-- 
cgit v1.2.3


From be71b8553d0522aba535a815baaebb1f0bb9a9ec Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 12 Feb 2009 13:49:38 +0100
Subject: x86, mce, cmci: recheck CMCI banks after APIC has been enabled on CPU
 #0

Impact: Fix marginal race condition

One the first CPU the machine checks are enabled early before
the local APIC is enabled. This could in theory lead
to some lost CMCI events very early during boot because
CMCIs cannot be delivered with disabled LAPIC.

The poller also doesn't recover from this because it doesn't
check CMCI banks.

Add an explicit CMCI banks check after the LAPIC is enabled.
This is only done for CPU #0, the other CPUs only initialize
machine checks after the LAPIC is on.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 648676f0b50..57b53774d98 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -48,6 +48,7 @@
 #include <asm/apic.h>
 #include <asm/i8259.h>
 #include <asm/smp.h>
+#include <asm/mce.h>
 
 #include <mach_apic.h>
 #include <mach_apicdef.h>
@@ -1270,6 +1271,12 @@ void __cpuinit setup_local_APIC(void)
 	apic_write(APIC_LVT1, value);
 
 	preempt_enable();
+
+#ifdef CONFIG_X86_MCE_INTEL
+	/* Recheck CMCI information after local APIC is up on CPU #0 */
+	if (smp_processor_id() == 0)
+		cmci_recheck();
+#endif
 }
 
 void __cpuinit end_local_APIC_setup(void)
-- 
cgit v1.2.3


From 73af76dfd1f998dba71d8e8e785cbe77a990bf17 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 4 Mar 2009 11:47:17 +0100
Subject: x86, mce: fix build failure in arch/x86/kernel/cpu/mcheck/threshold.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: build fix

The APIC code rewrite in the x86 tree broke the x86/mce branch:

 arch/x86/kernel/cpu/mcheck/threshold.c: In function ‘mce_threshold_interrupt’:
 arch/x86/kernel/cpu/mcheck/threshold.c:24: error: implicit declaration of function ‘ack_APIC_irq’

Also tidy up the file a bit while at it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mcheck/threshold.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index e4b8a3833fc..23ee9e730f7 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -1,9 +1,13 @@
-/* Common corrected MCE threshold handler code */
-#include <linux/kernel.h>
+/*
+ * Common corrected MCE threshold handler code:
+ */
 #include <linux/interrupt.h>
-#include <asm/mce.h>
+#include <linux/kernel.h>
+
 #include <asm/irq_vectors.h>
+#include <asm/apic.h>
 #include <asm/idle.h>
+#include <asm/mce.h>
 
 static void default_threshold_interrupt(void)
 {
-- 
cgit v1.2.3


From acaabe795a62bba089c185917af86b44654313dc Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Wed, 4 Mar 2009 12:56:05 -0600
Subject: x86: UV, SGI RTC: add generic system vector

This patch allocates a system interrupt vector for various platform
specific uses.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: john stultz <johnstul@us.ibm.com>
LKML-Reference: <20090304185605.GA24419@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_64.S   |  2 ++
 arch/x86/kernel/irq.c        | 34 ++++++++++++++++++++++++++++++++++
 arch/x86/kernel/irqinit_32.c |  3 +++
 arch/x86/kernel/irqinit_64.c |  3 +++
 4 files changed, 42 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 83d1836b946..7ba4621c0df 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -984,6 +984,8 @@ apicinterrupt UV_BAU_MESSAGE \
 #endif
 apicinterrupt LOCAL_TIMER_VECTOR \
 	apic_timer_interrupt smp_apic_timer_interrupt
+apicinterrupt GENERIC_INTERRUPT_VECTOR \
+	generic_interrupt smp_generic_interrupt
 
 #ifdef CONFIG_SMP
 apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index f13ca1650aa..b864341dcc4 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -15,6 +15,9 @@
 
 atomic_t irq_err_count;
 
+/* Function pointer for generic interrupt vector handling */
+void (*generic_interrupt_extension)(void) = NULL;
+
 /*
  * 'what should we do if we get a hw irq event on an illegal vector'.
  * each architecture has to answer this themselves.
@@ -56,6 +59,12 @@ static int show_other_interrupts(struct seq_file *p)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
 	seq_printf(p, "  Local timer interrupts\n");
 #endif
+	if (generic_interrupt_extension) {
+		seq_printf(p, "PLT: ");
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
+		seq_printf(p, "  Platform interrupts\n");
+	}
 #ifdef CONFIG_SMP
 	seq_printf(p, "RES: ");
 	for_each_online_cpu(j)
@@ -163,6 +172,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 #ifdef CONFIG_X86_LOCAL_APIC
 	sum += irq_stats(cpu)->apic_timer_irqs;
 #endif
+	if (generic_interrupt_extension)
+		sum += irq_stats(cpu)->generic_irqs;
 #ifdef CONFIG_SMP
 	sum += irq_stats(cpu)->irq_resched_count;
 	sum += irq_stats(cpu)->irq_call_count;
@@ -226,4 +237,27 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 	return 1;
 }
 
+/*
+ * Handler for GENERIC_INTERRUPT_VECTOR.
+ */
+void smp_generic_interrupt(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+
+	ack_APIC_irq();
+
+	exit_idle();
+
+	irq_enter();
+
+	inc_irq_stat(generic_irqs);
+
+	if (generic_interrupt_extension)
+		generic_interrupt_extension();
+
+	irq_exit();
+
+	set_irq_regs(old_regs);
+}
+
 EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 50b8c3a3006..bc132610544 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -175,6 +175,9 @@ void __init native_init_IRQ(void)
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
+	/* generic IPI for platform specific use */
+	alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index da481a1e3f3..c7a49e0ffbf 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -147,6 +147,9 @@ static void __init apic_intr_init(void)
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
+	/* generic IPI for platform specific use */
+	alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-- 
cgit v1.2.3


From 5ab5ab34498f94d60884c4ccea890601e429042e Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Wed, 4 Mar 2009 12:59:18 -0600
Subject: x86: UV, SGI RTC: add UV RTC clocksource/clockevents

This patch provides a high resolution clock/timer source using the
SGI UV system-wide synchronized RTC clock/timer hardware.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: john stultz <johnstul@us.ibm.com>
LKML-Reference: <20090304185918.GC24419@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/Makefile  |   2 +-
 arch/x86/kernel/uv_time.c | 391 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 392 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kernel/uv_time.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 95f216bbfaf..339ce35648e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -111,7 +111,7 @@ obj-$(CONFIG_SWIOTLB)			+= pci-swiotlb_64.o # NB rename without _64
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
-	obj-$(CONFIG_X86_UV)		+= tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o
+	obj-$(CONFIG_X86_UV)		+= tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
 	obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer_64.o
 	obj-$(CONFIG_AUDIT)		+= audit_64.o
 
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
new file mode 100644
index 00000000000..6f8e3256ab2
--- /dev/null
+++ b/arch/x86/kernel/uv_time.c
@@ -0,0 +1,391 @@
+/*
+ * SGI RTC clock/timer routines.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ *  Copyright (c) 2009 Silicon Graphics, Inc.  All Rights Reserved.
+ *  Copyright (c) Dimitri Sivanich
+ */
+#include <linux/clockchips.h>
+
+#include <asm/uv/uv_mmrs.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
+
+#define RTC_NAME		"sgi_rtc"
+
+static cycle_t uv_read_rtc(void);
+static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
+static void uv_rtc_timer_setup(enum clock_event_mode,
+				struct clock_event_device *);
+
+static struct clocksource clocksource_uv = {
+	.name		= RTC_NAME,
+	.rating		= 400,
+	.read		= uv_read_rtc,
+	.mask		= (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
+	.shift		= 10,
+	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static struct clock_event_device clock_event_device_uv = {
+	.name		= RTC_NAME,
+	.features	= CLOCK_EVT_FEAT_ONESHOT,
+	.shift		= 20,
+	.rating		= 400,
+	.irq		= -1,
+	.set_next_event	= uv_rtc_next_event,
+	.set_mode	= uv_rtc_timer_setup,
+	.event_handler	= NULL,
+};
+
+static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
+
+/* There is one of these allocated per node */
+struct uv_rtc_timer_head {
+	spinlock_t	lock;
+	/* next cpu waiting for timer, local node relative: */
+	int		next_cpu;
+	/* number of cpus on this node: */
+	int		ncpus;
+	struct {
+		int	lcpu;		/* systemwide logical cpu number */
+		u64	expires;	/* next timer expiration for this cpu */
+	} cpu[1];
+};
+
+/*
+ * Access to uv_rtc_timer_head via blade id.
+ */
+static struct uv_rtc_timer_head		**blade_info __read_mostly;
+
+static int				uv_rtc_enable;
+
+/*
+ * Hardware interface routines
+ */
+
+/* Send IPIs to another node */
+static void uv_rtc_send_IPI(int cpu)
+{
+	unsigned long apicid, val;
+	int pnode;
+
+	apicid = per_cpu(x86_cpu_to_apicid, cpu);
+	pnode = uv_apicid_to_pnode(apicid);
+	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+	      (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+	      (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
+
+	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
+}
+
+/* Check for an RTC interrupt pending */
+static int uv_intr_pending(int pnode)
+{
+	return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
+		UVH_EVENT_OCCURRED0_RTC1_MASK;
+}
+
+/* Setup interrupt and return non-zero if early expiration occurred. */
+static int uv_setup_intr(int cpu, u64 expires)
+{
+	u64 val;
+	int pnode = uv_cpu_to_pnode(cpu);
+
+	uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
+		UVH_RTC1_INT_CONFIG_M_MASK);
+	uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
+
+	uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
+		UVH_EVENT_OCCURRED0_RTC1_MASK);
+
+	val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
+		((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
+
+	/* Set configuration */
+	uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
+	/* Initialize comparator value */
+	uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
+
+	return (expires < uv_read_rtc() && !uv_intr_pending(pnode));
+}
+
+/*
+ * Per-cpu timer tracking routines
+ */
+
+static __init void uv_rtc_deallocate_timers(void)
+{
+	int bid;
+
+	for_each_possible_blade(bid) {
+		kfree(blade_info[bid]);
+	}
+	kfree(blade_info);
+}
+
+/* Allocate per-node list of cpu timer expiration times. */
+static __init int uv_rtc_allocate_timers(void)
+{
+	int cpu;
+
+	blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL);
+	if (!blade_info)
+		return -ENOMEM;
+	memset(blade_info, 0, uv_possible_blades * sizeof(void *));
+
+	for_each_present_cpu(cpu) {
+		int nid = cpu_to_node(cpu);
+		int bid = uv_cpu_to_blade_id(cpu);
+		int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+		struct uv_rtc_timer_head *head = blade_info[bid];
+
+		if (!head) {
+			head = kmalloc_node(sizeof(struct uv_rtc_timer_head) +
+				(uv_blade_nr_possible_cpus(bid) *
+					2 * sizeof(u64)),
+				GFP_KERNEL, nid);
+			if (!head) {
+				uv_rtc_deallocate_timers();
+				return -ENOMEM;
+			}
+			spin_lock_init(&head->lock);
+			head->ncpus = uv_blade_nr_possible_cpus(bid);
+			head->next_cpu = -1;
+			blade_info[bid] = head;
+		}
+
+		head->cpu[bcpu].lcpu = cpu;
+		head->cpu[bcpu].expires = ULLONG_MAX;
+	}
+
+	return 0;
+}
+
+/* Find and set the next expiring timer.  */
+static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode)
+{
+	u64 lowest = ULLONG_MAX;
+	int c, bcpu = -1;
+
+	head->next_cpu = -1;
+	for (c = 0; c < head->ncpus; c++) {
+		u64 exp = head->cpu[c].expires;
+		if (exp < lowest) {
+			bcpu = c;
+			lowest = exp;
+		}
+	}
+	if (bcpu >= 0) {
+		head->next_cpu = bcpu;
+		c = head->cpu[bcpu].lcpu;
+		if (uv_setup_intr(c, lowest))
+			/* If we didn't set it up in time, trigger */
+			uv_rtc_send_IPI(c);
+	} else {
+		uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
+			UVH_RTC1_INT_CONFIG_M_MASK);
+	}
+}
+
+/*
+ * Set expiration time for current cpu.
+ *
+ * Returns 1 if we missed the expiration time.
+ */
+static int uv_rtc_set_timer(int cpu, u64 expires)
+{
+	int pnode = uv_cpu_to_pnode(cpu);
+	int bid = uv_cpu_to_blade_id(cpu);
+	struct uv_rtc_timer_head *head = blade_info[bid];
+	int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+	u64 *t = &head->cpu[bcpu].expires;
+	unsigned long flags;
+	int next_cpu;
+
+	spin_lock_irqsave(&head->lock, flags);
+
+	next_cpu = head->next_cpu;
+	*t = expires;
+	/* Will this one be next to go off? */
+	if (next_cpu < 0 || bcpu == next_cpu ||
+			expires < head->cpu[next_cpu].expires) {
+		head->next_cpu = bcpu;
+		if (uv_setup_intr(cpu, expires)) {
+			*t = ULLONG_MAX;
+			uv_rtc_find_next_timer(head, pnode);
+			spin_unlock_irqrestore(&head->lock, flags);
+			return 1;
+		}
+	}
+
+	spin_unlock_irqrestore(&head->lock, flags);
+	return 0;
+}
+
+/*
+ * Unset expiration time for current cpu.
+ *
+ * Returns 1 if this timer was pending.
+ */
+static int uv_rtc_unset_timer(int cpu)
+{
+	int pnode = uv_cpu_to_pnode(cpu);
+	int bid = uv_cpu_to_blade_id(cpu);
+	struct uv_rtc_timer_head *head = blade_info[bid];
+	int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+	u64 *t = &head->cpu[bcpu].expires;
+	unsigned long flags;
+	int rc = 0;
+
+	spin_lock_irqsave(&head->lock, flags);
+
+	if (head->next_cpu == bcpu && uv_read_rtc() >= *t)
+		rc = 1;
+
+	*t = ULLONG_MAX;
+
+	/* Was the hardware setup for this timer? */
+	if (head->next_cpu == bcpu)
+		uv_rtc_find_next_timer(head, pnode);
+
+	spin_unlock_irqrestore(&head->lock, flags);
+
+	return rc;
+}
+
+
+/*
+ * Kernel interface routines.
+ */
+
+/*
+ * Read the RTC.
+ */
+static cycle_t uv_read_rtc(void)
+{
+	return (cycle_t)uv_read_local_mmr(UVH_RTC);
+}
+
+/*
+ * Program the next event, relative to now
+ */
+static int uv_rtc_next_event(unsigned long delta,
+			     struct clock_event_device *ced)
+{
+	int ced_cpu = cpumask_first(ced->cpumask);
+
+	return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc());
+}
+
+/*
+ * Setup the RTC timer in oneshot mode
+ */
+static void uv_rtc_timer_setup(enum clock_event_mode mode,
+			       struct clock_event_device *evt)
+{
+	int ced_cpu = cpumask_first(evt->cpumask);
+
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+	case CLOCK_EVT_MODE_ONESHOT:
+	case CLOCK_EVT_MODE_RESUME:
+		/* Nothing to do here yet */
+		break;
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		uv_rtc_unset_timer(ced_cpu);
+		break;
+	}
+}
+
+static void uv_rtc_interrupt(void)
+{
+	struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
+	int cpu = smp_processor_id();
+
+	if (!ced || !ced->event_handler)
+		return;
+
+	if (uv_rtc_unset_timer(cpu) != 1)
+		return;
+
+	ced->event_handler(ced);
+}
+
+static int __init uv_enable_rtc(char *str)
+{
+	uv_rtc_enable = 1;
+
+	return 1;
+}
+__setup("uvrtc", uv_enable_rtc);
+
+static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
+{
+	struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
+
+	*ced = clock_event_device_uv;
+	ced->cpumask = cpumask_of(smp_processor_id());
+	clockevents_register_device(ced);
+}
+
+static __init int uv_rtc_setup_clock(void)
+{
+	int rc;
+
+	if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension)
+		return -ENODEV;
+
+	generic_interrupt_extension = uv_rtc_interrupt;
+
+	clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
+				clocksource_uv.shift);
+
+	rc = clocksource_register(&clocksource_uv);
+	if (rc) {
+		generic_interrupt_extension = NULL;
+		return rc;
+	}
+
+	/* Setup and register clockevents */
+	rc = uv_rtc_allocate_timers();
+	if (rc) {
+		clocksource_unregister(&clocksource_uv);
+		generic_interrupt_extension = NULL;
+		return rc;
+	}
+
+	clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
+				NSEC_PER_SEC, clock_event_device_uv.shift);
+
+	clock_event_device_uv.min_delta_ns = NSEC_PER_SEC /
+						sn_rtc_cycles_per_second;
+
+	clock_event_device_uv.max_delta_ns = clocksource_uv.mask *
+				(NSEC_PER_SEC / sn_rtc_cycles_per_second);
+
+	rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
+	if (rc) {
+		clocksource_unregister(&clocksource_uv);
+		generic_interrupt_extension = NULL;
+		uv_rtc_deallocate_timers();
+	}
+
+	return rc;
+}
+arch_initcall(uv_rtc_setup_clock);
-- 
cgit v1.2.3


From 8d4dd919b46ed982da6ef6bf6fcec454cd7a5b1b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 4 Mar 2009 01:25:21 -0800
Subject: x86: ioremap mptable

Impact: fix boot with mptable above max_low_mapped

Try to use early_ioremap() to map MPC to make sure it works even it is
at the end of ram.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <49AE4901.3090801@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Reported-and-tested-by: Kevin O'Connor <kevin@koconnor.net>
---
 arch/x86/kernel/mpparse.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 37cb1bda1ba..ae9060cb448 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -558,6 +558,19 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
 
 static struct mpf_intel *mpf_found;
 
+static unsigned long __init get_mpc_size(unsigned long physptr)
+{
+	struct mpc_table *mpc;
+	unsigned long size;
+
+	mpc = early_ioremap(physptr, PAGE_SIZE);
+	size = mpc->length;
+	early_iounmap(mpc, PAGE_SIZE);
+	apic_printk(APIC_VERBOSE, "  mpc: %lx-%lx\n", physptr, physptr + size);
+
+	return size;
+}
+
 /*
  * Scan the memory blocks for an SMP configuration block.
  */
@@ -611,12 +624,16 @@ static void __init __get_smp_config(unsigned int early)
 		construct_default_ISA_mptable(mpf->feature1);
 
 	} else if (mpf->physptr) {
+		struct mpc_table *mpc;
+		unsigned long size;
 
+		size = get_mpc_size(mpf->physptr);
+		mpc = early_ioremap(mpf->physptr, size);
 		/*
 		 * Read the physical hardware table.  Anything here will
 		 * override the defaults.
 		 */
-		if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) {
+		if (!smp_read_mpc(mpc, early)) {
 #ifdef CONFIG_X86_LOCAL_APIC
 			smp_found_config = 0;
 #endif
@@ -624,8 +641,10 @@ static void __init __get_smp_config(unsigned int early)
 			       "BIOS bug, MP table errors detected!...\n");
 			printk(KERN_ERR "... disabling SMP support. "
 			       "(tell your hw vendor)\n");
+			early_iounmap(mpc, size);
 			return;
 		}
+		early_iounmap(mpc, size);
 
 		if (early)
 			return;
-- 
cgit v1.2.3


From f62432395ec54e93f113091bcb2e2017eeed7683 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 4 Mar 2009 01:25:54 -0800
Subject: x86: reserve exact size of mptable

Impact: save a bit of RAM

Get the exact size for the reserve_bootmem() call.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <49AE4922.605@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/mpparse.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index ae9060cb448..e8192401da4 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -716,10 +716,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 
 			if (!reserve)
 				return 1;
-			reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
+			reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
 					BOOTMEM_DEFAULT);
 			if (mpf->physptr) {
-				unsigned long size = PAGE_SIZE;
+				unsigned long size = get_mpc_size(mpf->physptr);
 #ifdef CONFIG_X86_32
 				/*
 				 * We cannot access to MPC table to compute
-- 
cgit v1.2.3


From 1400b3faab8fedfffde5a7fe47098e2732d4aa76 Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Wed, 4 Mar 2009 16:02:46 -0600
Subject: x86: UV, SGI RTC: fix uv_time.c for UP

Fix non-smp build of uv_time.c.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
LKML-Reference: <20090304220246.GC6288@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/uv_time.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 6f8e3256ab2..2ffb6c53326 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -24,6 +24,8 @@
 #include <asm/uv/uv_hub.h>
 #include <asm/uv/bios.h>
 #include <asm/uv/uv.h>
+#include <asm/apic.h>
+#include <asm/cpu.h>
 
 #define RTC_NAME		"sgi_rtc"
 
@@ -84,7 +86,7 @@ static void uv_rtc_send_IPI(int cpu)
 	unsigned long apicid, val;
 	int pnode;
 
-	apicid = per_cpu(x86_cpu_to_apicid, cpu);
+	apicid = cpu_physical_id(cpu);
 	pnode = uv_apicid_to_pnode(apicid);
 	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
 	      (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
-- 
cgit v1.2.3


From ed26dbe5ae045e5bf95c6dc27497397a3fde52e1 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 4 Mar 2009 16:16:51 -0800
Subject: x86: pre-initialize boot_cpu_data.x86_phys_bits to avoid system_state
 tests

Impact: cleanup, micro-optimization

Pre-initialize boot_cpu_data.x86_phys_bits to a reasonable default
to remove the use of system_state tests in __virt_addr_valid()
and __phys_addr().

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b746deb9ebc..f28c56e6bf9 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -202,7 +202,9 @@ struct ist_info ist_info;
 #endif
 
 #else
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
+struct cpuinfo_x86 boot_cpu_data __read_mostly = {
+	.x86_phys_bits = MAX_PHYSMEM_BITS,
+};
 EXPORT_SYMBOL(boot_cpu_data);
 #endif
 
-- 
cgit v1.2.3


From 1f442d70c84aa798e243e721eba728a98434cd86 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 7 Mar 2009 23:46:26 -0800
Subject: x86: remove smp_apply_quirks()/smp_checks()

Impact: cleanup and code size reduction on 64-bit

This code is only applied to Intel Pentium and AMD K7 32-bit cpus.

Move those checks to intel_init()/amd_init() for 32-bit
so 64-bit will not build this code.

Also change to use cpu_index check to see if we need to emit warning.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49B377D2.8030108@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/amd.c   | 52 ++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/intel.c | 25 +++++++++++++++
 arch/x86/kernel/smpboot.c   | 78 ---------------------------------------------
 3 files changed, 77 insertions(+), 78 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 25423a5b80e..f47df59016c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
+#include <asm/cpu.h>
 
 #ifdef CONFIG_X86_64
 # include <asm/numa_64.h>
@@ -141,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
 	}
 }
 
+static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	/* calling is from identify_secondary_cpu() ? */
+	if (c->cpu_index == boot_cpu_id)
+		return;
+
+	/*
+	 * Certain Athlons might work (for various values of 'work') in SMP
+	 * but they are not certified as MP capable.
+	 */
+	/* Athlon 660/661 is valid. */
+	if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
+	    (c->x86_mask == 1)))
+		goto valid_k7;
+
+	/* Duron 670 is valid */
+	if ((c->x86_model == 7) && (c->x86_mask == 0))
+		goto valid_k7;
+
+	/*
+	 * Athlon 662, Duron 671, and Athlon >model 7 have capability
+	 * bit. It's worth noting that the A5 stepping (662) of some
+	 * Athlon XP's have the MP bit set.
+	 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
+	 * more.
+	 */
+	if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
+	    ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
+	     (c->x86_model > 7))
+		if (cpu_has_mp)
+			goto valid_k7;
+
+	/* If we get here, not a certified SMP capable AMD system. */
+
+	/*
+	 * Don't taint if we are running SMP kernel on a single non-MP
+	 * approved Athlon
+	 */
+	WARN_ONCE(1, "WARNING: This combination of AMD"
+		"processors is not suitable for SMP.\n");
+	if (!test_taint(TAINT_UNSAFE_SMP))
+		add_taint(TAINT_UNSAFE_SMP);
+
+valid_k7:
+	;
+#endif
+}
+
 static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
@@ -175,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
 	}
 
 	set_cpu_cap(c, X86_FEATURE_K7);
+
+	amd_k7_smp_check(c);
 }
 #endif
 
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 25c559ba8d5..191117f1ad5 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -13,6 +13,7 @@
 #include <asm/uaccess.h>
 #include <asm/ds.h>
 #include <asm/bugs.h>
+#include <asm/cpu.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/topology.h>
@@ -110,6 +111,28 @@ static void __cpuinit trap_init_f00f_bug(void)
 }
 #endif
 
+static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	/* calling is from identify_secondary_cpu() ? */
+	if (c->cpu_index == boot_cpu_id)
+		return;
+
+	/*
+	 * Mask B, Pentium, but not Pentium MMX
+	 */
+	if (c->x86 == 5 &&
+	    c->x86_mask >= 1 && c->x86_mask <= 4 &&
+	    c->x86_model <= 3) {
+		/*
+		 * Remember we have B step Pentia with bugs
+		 */
+		WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
+				    "with B stepping processors.\n");
+	}
+#endif
+}
+
 static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
 {
 	unsigned long lo, hi;
@@ -186,6 +209,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
 #ifdef CONFIG_X86_NUMAQ
 	numaq_tsc_disable();
 #endif
+
+	intel_smp_check(c);
 }
 #else
 static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 249334f5080..ef7d10170c3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -114,10 +114,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
 
 atomic_t init_deasserted;
 
-
-/* Set if we find a B stepping CPU */
-static int __cpuinitdata smp_b_stepping;
-
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
 
 /* which logical CPUs are on which nodes */
@@ -271,8 +267,6 @@ static void __cpuinit smp_callin(void)
 	cpumask_set_cpu(cpuid, cpu_callin_mask);
 }
 
-static int __cpuinitdata unsafe_smp;
-
 /*
  * Activate a secondary processor.
  */
@@ -340,76 +334,6 @@ notrace static void __cpuinit start_secondary(void *unused)
 	cpu_idle();
 }
 
-static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
-{
-	/*
-	 * Mask B, Pentium, but not Pentium MMX
-	 */
-	if (c->x86_vendor == X86_VENDOR_INTEL &&
-	    c->x86 == 5 &&
-	    c->x86_mask >= 1 && c->x86_mask <= 4 &&
-	    c->x86_model <= 3)
-		/*
-		 * Remember we have B step Pentia with bugs
-		 */
-		smp_b_stepping = 1;
-
-	/*
-	 * Certain Athlons might work (for various values of 'work') in SMP
-	 * but they are not certified as MP capable.
-	 */
-	if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
-
-		if (num_possible_cpus() == 1)
-			goto valid_k7;
-
-		/* Athlon 660/661 is valid. */
-		if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
-		    (c->x86_mask == 1)))
-			goto valid_k7;
-
-		/* Duron 670 is valid */
-		if ((c->x86_model == 7) && (c->x86_mask == 0))
-			goto valid_k7;
-
-		/*
-		 * Athlon 662, Duron 671, and Athlon >model 7 have capability
-		 * bit. It's worth noting that the A5 stepping (662) of some
-		 * Athlon XP's have the MP bit set.
-		 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
-		 * more.
-		 */
-		if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
-		    ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
-		     (c->x86_model > 7))
-			if (cpu_has_mp)
-				goto valid_k7;
-
-		/* If we get here, not a certified SMP capable AMD system. */
-		unsafe_smp = 1;
-	}
-
-valid_k7:
-	;
-}
-
-static void __cpuinit smp_checks(void)
-{
-	if (smp_b_stepping)
-		printk(KERN_WARNING "WARNING: SMP operation may be unreliable"
-				    "with B stepping processors.\n");
-
-	/*
-	 * Don't taint if we are running SMP kernel on a single non-MP
-	 * approved Athlon
-	 */
-	if (unsafe_smp && num_online_cpus() > 1) {
-		printk(KERN_INFO "WARNING: This combination of AMD"
-			"processors is not suitable for SMP.\n");
-		add_taint(TAINT_UNSAFE_SMP);
-	}
-}
-
 /*
  * The bootstrap kernel entry code has set these up. Save them for
  * a given CPU
@@ -423,7 +347,6 @@ void __cpuinit smp_store_cpu_info(int id)
 	c->cpu_index = id;
 	if (id != 0)
 		identify_secondary_cpu(c);
-	smp_apply_quirks(c);
 }
 
 
@@ -1193,7 +1116,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 	pr_debug("Boot done.\n");
 
 	impress_friends();
-	smp_checks();
 #ifdef CONFIG_X86_IO_APIC
 	setup_ioapic_dest();
 #endif
-- 
cgit v1.2.3


From 8c5dfd25519bf302ba43daa59976c4d675a594a7 Mon Sep 17 00:00:00 2001
From: Stoyan Gaydarov <stoyboyker@gmail.com>
Date: Tue, 10 Mar 2009 00:10:32 -0500
Subject: x86: BUG to BUG_ON changes

Impact: cleanup

Signed-off-by: Stoyan Gaydarov <stoyboyker@gmail.com>
LKML-Reference: <1236661850-8237-8-git-send-email-stoyboyker@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/common.c | 6 ++----
 arch/x86/kernel/quirks.c     | 3 +--
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 826d5c87627..f8869978bbb 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1078,8 +1078,7 @@ void __cpuinit cpu_init(void)
 
 	atomic_inc(&init_mm.mm_count);
 	me->active_mm = &init_mm;
-	if (me->mm)
-		BUG();
+	BUG_ON(me->mm);
 	enter_lazy_tlb(&init_mm, me);
 
 	load_sp0(t, &current->thread);
@@ -1145,8 +1144,7 @@ void __cpuinit cpu_init(void)
 	 */
 	atomic_inc(&init_mm.mm_count);
 	curr->active_mm = &init_mm;
-	if (curr->mm)
-		BUG();
+	BUG_ON(curr->mm);
 	enter_lazy_tlb(&init_mm, curr);
 
 	load_sp0(t, thread);
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 309949e9e1c..6a5a2970f4c 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -74,8 +74,7 @@ static void ich_force_hpet_resume(void)
 	if (!force_hpet_address)
 		return;
 
-	if (rcba_base == NULL)
-		BUG();
+	BUG_ON(rcba_base == NULL);
 
 	/* read the Function Disable register, dword mode only */
 	val = readl(rcba_base + 0x3404);
-- 
cgit v1.2.3


From 9b779edf4b97798d037bb44fca2719ac184d0f14 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Tue, 10 Mar 2009 15:37:51 +0530
Subject: x86: cpu architecture debug code

Introduce:

 cat /sys/kernel/debug/x86/cpu/*

for Intel and AMD processors to view / debug the state of each CPU.

By using this we can debug whole range of registers and other
cpu information for debugging purpose and monitor how things
are changing.

This can be useful for developers as well as for users.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
LKML-Reference: <1236701373.3387.4.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/Makefile    |   2 +
 arch/x86/kernel/cpu/cpu_debug.c | 784 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 786 insertions(+)
 create mode 100755 arch/x86/kernel/cpu/cpu_debug.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82db7f45e2d..d4356f8b752 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,6 +14,8 @@ obj-y			+= vmware.o hypervisor.o
 obj-$(CONFIG_X86_32)	+= bugs.o cmpxchg.o
 obj-$(CONFIG_X86_64)	+= bugs_64.o
 
+obj-$(CONFIG_X86_CPU_DEBUG)		+= cpu_debug.o
+
 obj-$(CONFIG_CPU_SUP_INTEL)		+= intel.o
 obj-$(CONFIG_CPU_SUP_AMD)		+= amd.o
 obj-$(CONFIG_CPU_SUP_CYRIX_32)		+= cyrix.o
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
new file mode 100755
index 00000000000..0bdf4daba20
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -0,0 +1,784 @@
+/*
+ * CPU x86 architecture debug code
+ *
+ * Copyright(C) 2009 Jaswinder Singh Rajput
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/interrupt.h>
+#include <linux/compiler.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/kprobes.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+
+#include <asm/cpu_debug.h>
+#include <asm/system.h>
+#include <asm/traps.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+
+static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
+static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
+static DEFINE_PER_CPU(unsigned, cpu_modelflag);
+static DEFINE_PER_CPU(int, cpu_priv_count);
+static DEFINE_PER_CPU(unsigned, cpu_model);
+
+static DEFINE_MUTEX(cpu_debug_lock);
+
+static struct dentry *cpu_debugfs_dir;
+
+static struct cpu_debug_base cpu_base[] = {
+	{ "mc",		CPU_MC		},	/* Machine Check	*/
+	{ "monitor",	CPU_MONITOR	},	/* Monitor		*/
+	{ "time",	CPU_TIME	},	/* Time			*/
+	{ "pmc",	CPU_PMC		},	/* Performance Monitor	*/
+	{ "platform",	CPU_PLATFORM	},	/* Platform		*/
+	{ "apic",	CPU_APIC	},	/* APIC			*/
+	{ "poweron",	CPU_POWERON	},	/* Power-on		*/
+	{ "control",	CPU_CONTROL	},	/* Control		*/
+	{ "features",	CPU_FEATURES	},	/* Features control	*/
+	{ "lastbranch",	CPU_LBRANCH	},	/* Last Branch		*/
+	{ "bios",	CPU_BIOS	},	/* BIOS			*/
+	{ "freq",	CPU_FREQ	},	/* Frequency		*/
+	{ "mtrr",	CPU_MTRR	},	/* MTRR			*/
+	{ "perf",	CPU_PERF	},	/* Performance		*/
+	{ "cache",	CPU_CACHE	},	/* Cache		*/
+	{ "sysenter",	CPU_SYSENTER	},	/* Sysenter		*/
+	{ "therm",	CPU_THERM	},	/* Thermal		*/
+	{ "misc",	CPU_MISC	},	/* Miscellaneous	*/
+	{ "debug",	CPU_DEBUG	},	/* Debug		*/
+	{ "pat",	CPU_PAT		},	/* PAT			*/
+	{ "vmx",	CPU_VMX		},	/* VMX			*/
+	{ "call",	CPU_CALL	},	/* System Call		*/
+	{ "base",	CPU_BASE	},	/* BASE Address		*/
+	{ "smm",	CPU_SMM		},	/* System mgmt mode	*/
+	{ "svm",	CPU_SVM		},	/*Secure Virtial Machine*/
+	{ "osvm",	CPU_OSVM	},	/* OS-Visible Workaround*/
+	{ "tss",	CPU_TSS		},	/* Task Stack Segment	*/
+	{ "cr",		CPU_CR		},	/* Control Registers	*/
+	{ "dt",		CPU_DT		},	/* Descriptor Table	*/
+	{ "registers",	CPU_REG_ALL	},	/* Select all Registers	*/
+};
+
+static struct cpu_file_base cpu_file[] = {
+	{ "index",	CPU_REG_ALL	},	/* index		*/
+	{ "value",	CPU_REG_ALL	},	/* value		*/
+};
+
+/* Intel Registers Range */
+static struct cpu_debug_range cpu_intel_range[] = {
+	{ 0x00000000, 0x00000001, CPU_MC,	CPU_INTEL_ALL		},
+	{ 0x00000006, 0x00000007, CPU_MONITOR,	CPU_CX_AT_XE		},
+	{ 0x00000010, 0x00000010, CPU_TIME,	CPU_INTEL_ALL		},
+	{ 0x00000011, 0x00000013, CPU_PMC,	CPU_INTEL_PENTIUM	},
+	{ 0x00000017, 0x00000017, CPU_PLATFORM,	CPU_PX_CX_AT_XE		},
+	{ 0x0000001B, 0x0000001B, CPU_APIC,	CPU_P6_CX_AT_XE		},
+
+	{ 0x0000002A, 0x0000002A, CPU_POWERON,	CPU_PX_CX_AT_XE		},
+	{ 0x0000002B, 0x0000002B, CPU_POWERON,	CPU_INTEL_XEON		},
+	{ 0x0000002C, 0x0000002C, CPU_FREQ,	CPU_INTEL_XEON		},
+	{ 0x0000003A, 0x0000003A, CPU_CONTROL,	CPU_CX_AT_XE		},
+
+	{ 0x00000040, 0x00000043, CPU_LBRANCH,	CPU_PM_CX_AT_XE		},
+	{ 0x00000044, 0x00000047, CPU_LBRANCH,	CPU_PM_CO_AT		},
+	{ 0x00000060, 0x00000063, CPU_LBRANCH,	CPU_C2_AT		},
+	{ 0x00000064, 0x00000067, CPU_LBRANCH,	CPU_INTEL_ATOM		},
+
+	{ 0x00000079, 0x00000079, CPU_BIOS,	CPU_P6_CX_AT_XE		},
+	{ 0x00000088, 0x0000008A, CPU_CACHE,	CPU_INTEL_P6		},
+	{ 0x0000008B, 0x0000008B, CPU_BIOS,	CPU_P6_CX_AT_XE		},
+	{ 0x0000009B, 0x0000009B, CPU_MONITOR,	CPU_INTEL_XEON		},
+
+	{ 0x000000C1, 0x000000C2, CPU_PMC,	CPU_P6_CX_AT		},
+	{ 0x000000CD, 0x000000CD, CPU_FREQ,	CPU_CX_AT		},
+	{ 0x000000E7, 0x000000E8, CPU_PERF,	CPU_CX_AT		},
+	{ 0x000000FE, 0x000000FE, CPU_MTRR,	CPU_P6_CX_XE		},
+
+	{ 0x00000116, 0x00000116, CPU_CACHE,	CPU_INTEL_P6		},
+	{ 0x00000118, 0x00000118, CPU_CACHE,	CPU_INTEL_P6		},
+	{ 0x00000119, 0x00000119, CPU_CACHE,	CPU_INTEL_PX		},
+	{ 0x0000011A, 0x0000011B, CPU_CACHE,	CPU_INTEL_P6		},
+	{ 0x0000011E, 0x0000011E, CPU_CACHE,	CPU_PX_CX_AT		},
+
+	{ 0x00000174, 0x00000176, CPU_SYSENTER,	CPU_P6_CX_AT_XE		},
+	{ 0x00000179, 0x0000017A, CPU_MC,	CPU_PX_CX_AT_XE		},
+	{ 0x0000017B, 0x0000017B, CPU_MC,	CPU_P6_XE		},
+	{ 0x00000186, 0x00000187, CPU_PMC,	CPU_P6_CX_AT		},
+	{ 0x00000198, 0x00000199, CPU_PERF,	CPU_PM_CX_AT_XE		},
+	{ 0x0000019A, 0x0000019A, CPU_TIME,	CPU_PM_CX_AT_XE		},
+	{ 0x0000019B, 0x0000019D, CPU_THERM,	CPU_PM_CX_AT_XE		},
+	{ 0x000001A0, 0x000001A0, CPU_MISC,	CPU_PM_CX_AT_XE		},
+
+	{ 0x000001C9, 0x000001C9, CPU_LBRANCH,	CPU_PM_CX_AT		},
+	{ 0x000001D7, 0x000001D8, CPU_LBRANCH,	CPU_INTEL_XEON		},
+	{ 0x000001D9, 0x000001D9, CPU_DEBUG,	CPU_CX_AT_XE		},
+	{ 0x000001DA, 0x000001DA, CPU_LBRANCH,	CPU_INTEL_XEON		},
+	{ 0x000001DB, 0x000001DB, CPU_LBRANCH,	CPU_P6_XE		},
+	{ 0x000001DC, 0x000001DC, CPU_LBRANCH,	CPU_INTEL_P6		},
+	{ 0x000001DD, 0x000001DE, CPU_LBRANCH,	CPU_PX_CX_AT_XE		},
+	{ 0x000001E0, 0x000001E0, CPU_LBRANCH,	CPU_INTEL_P6		},
+
+	{ 0x00000200, 0x0000020F, CPU_MTRR,	CPU_P6_CX_XE		},
+	{ 0x00000250, 0x00000250, CPU_MTRR,	CPU_P6_CX_XE		},
+	{ 0x00000258, 0x00000259, CPU_MTRR,	CPU_P6_CX_XE		},
+	{ 0x00000268, 0x0000026F, CPU_MTRR,	CPU_P6_CX_XE		},
+	{ 0x00000277, 0x00000277, CPU_PAT,	CPU_C2_AT_XE		},
+	{ 0x000002FF, 0x000002FF, CPU_MTRR,	CPU_P6_CX_XE		},
+
+	{ 0x00000300, 0x00000308, CPU_PMC,	CPU_INTEL_XEON		},
+	{ 0x00000309, 0x0000030B, CPU_PMC,	CPU_C2_AT_XE		},
+	{ 0x0000030C, 0x00000311, CPU_PMC,	CPU_INTEL_XEON		},
+	{ 0x00000345, 0x00000345, CPU_PMC,	CPU_C2_AT		},
+	{ 0x00000360, 0x00000371, CPU_PMC,	CPU_INTEL_XEON		},
+	{ 0x0000038D, 0x00000390, CPU_PMC,	CPU_C2_AT		},
+	{ 0x000003A0, 0x000003BE, CPU_PMC,	CPU_INTEL_XEON		},
+	{ 0x000003C0, 0x000003CD, CPU_PMC,	CPU_INTEL_XEON		},
+	{ 0x000003E0, 0x000003E1, CPU_PMC,	CPU_INTEL_XEON		},
+	{ 0x000003F0, 0x000003F0, CPU_PMC,	CPU_INTEL_XEON		},
+	{ 0x000003F1, 0x000003F1, CPU_PMC,	CPU_C2_AT_XE		},
+	{ 0x000003F2, 0x000003F2, CPU_PMC,	CPU_INTEL_XEON		},
+
+	{ 0x00000400, 0x00000402, CPU_MC,	CPU_PM_CX_AT_XE		},
+	{ 0x00000403, 0x00000403, CPU_MC,	CPU_INTEL_XEON		},
+	{ 0x00000404, 0x00000406, CPU_MC,	CPU_PM_CX_AT_XE		},
+	{ 0x00000407, 0x00000407, CPU_MC,	CPU_INTEL_XEON		},
+	{ 0x00000408, 0x0000040A, CPU_MC,	CPU_PM_CX_AT_XE		},
+	{ 0x0000040B, 0x0000040B, CPU_MC,	CPU_INTEL_XEON		},
+	{ 0x0000040C, 0x0000040E, CPU_MC,	CPU_PM_CX_XE		},
+	{ 0x0000040F, 0x0000040F, CPU_MC,	CPU_INTEL_XEON		},
+	{ 0x00000410, 0x00000412, CPU_MC,	CPU_PM_CX_AT_XE		},
+	{ 0x00000413, 0x00000417, CPU_MC,	CPU_CX_AT_XE		},
+	{ 0x00000480, 0x0000048B, CPU_VMX,	CPU_CX_AT_XE		},
+
+	{ 0x00000600, 0x00000600, CPU_DEBUG,	CPU_PM_CX_AT_XE		},
+	{ 0x00000680, 0x0000068F, CPU_LBRANCH,	CPU_INTEL_XEON		},
+	{ 0x000006C0, 0x000006CF, CPU_LBRANCH,	CPU_INTEL_XEON		},
+
+	{ 0x000107CC, 0x000107D3, CPU_PMC,	CPU_INTEL_XEON_MP	},
+
+	{ 0xC0000080, 0xC0000080, CPU_FEATURES,	CPU_INTEL_XEON		},
+	{ 0xC0000081, 0xC0000082, CPU_CALL,	CPU_INTEL_XEON		},
+	{ 0xC0000084, 0xC0000084, CPU_CALL,	CPU_INTEL_XEON		},
+	{ 0xC0000100, 0xC0000102, CPU_BASE,	CPU_INTEL_XEON		},
+};
+
+/* AMD Registers Range */
+static struct cpu_debug_range cpu_amd_range[] = {
+	{ 0x00000010, 0x00000010, CPU_TIME,	CPU_ALL,		},
+	{ 0x0000001B, 0x0000001B, CPU_APIC,	CPU_ALL,		},
+	{ 0x000000FE, 0x000000FE, CPU_MTRR,	CPU_ALL,		},
+
+	{ 0x00000174, 0x00000176, CPU_SYSENTER,	CPU_ALL,		},
+	{ 0x00000179, 0x0000017A, CPU_MC,	CPU_ALL,		},
+	{ 0x0000017B, 0x0000017B, CPU_MC,	CPU_ALL,		},
+	{ 0x000001D9, 0x000001D9, CPU_DEBUG,	CPU_ALL,		},
+	{ 0x000001DB, 0x000001DE, CPU_LBRANCH,	CPU_ALL,		},
+
+	{ 0x00000200, 0x0000020F, CPU_MTRR,	CPU_ALL,		},
+	{ 0x00000250, 0x00000250, CPU_MTRR,	CPU_ALL,		},
+	{ 0x00000258, 0x00000259, CPU_MTRR,	CPU_ALL,		},
+	{ 0x00000268, 0x0000026F, CPU_MTRR,	CPU_ALL,		},
+	{ 0x00000277, 0x00000277, CPU_PAT,	CPU_ALL,		},
+	{ 0x000002FF, 0x000002FF, CPU_MTRR,	CPU_ALL,		},
+
+	{ 0x00000400, 0x00000417, CPU_MC,	CPU_ALL,		},
+
+	{ 0xC0000080, 0xC0000080, CPU_FEATURES,	CPU_ALL,		},
+	{ 0xC0000081, 0xC0000084, CPU_CALL,	CPU_ALL,		},
+	{ 0xC0000100, 0xC0000102, CPU_BASE,	CPU_ALL,		},
+	{ 0xC0000103, 0xC0000103, CPU_TIME,	CPU_ALL,		},
+
+	{ 0xC0000408, 0xC000040A, CPU_MC,	CPU_ALL,		},
+
+	{ 0xc0010000, 0xc0010007, CPU_PMC,	CPU_ALL,		},
+	{ 0xc0010010, 0xc0010010, CPU_MTRR,	CPU_ALL,		},
+	{ 0xc0010016, 0xc001001A, CPU_MTRR,	CPU_ALL,		},
+	{ 0xc001001D, 0xc001001D, CPU_MTRR,	CPU_ALL,		},
+	{ 0xc0010030, 0xc0010035, CPU_BIOS,	CPU_ALL,		},
+	{ 0xc0010056, 0xc0010056, CPU_SMM,	CPU_ALL,		},
+	{ 0xc0010061, 0xc0010063, CPU_SMM,	CPU_ALL,		},
+	{ 0xc0010074, 0xc0010074, CPU_MC,	CPU_ALL,		},
+	{ 0xc0010111, 0xc0010113, CPU_SMM,	CPU_ALL,		},
+	{ 0xc0010114, 0xc0010118, CPU_SVM,	CPU_ALL,		},
+	{ 0xc0010119, 0xc001011A, CPU_SMM,	CPU_ALL,		},
+	{ 0xc0010140, 0xc0010141, CPU_OSVM,	CPU_ALL,		},
+	{ 0xc0010156, 0xc0010156, CPU_SMM,	CPU_ALL,		},
+};
+
+
+static int get_cpu_modelflag(unsigned cpu)
+{
+	int flag;
+
+	switch (per_cpu(cpu_model, cpu)) {
+	/* Intel */
+	case 0x0501:
+	case 0x0502:
+	case 0x0504:
+		flag = CPU_INTEL_PENTIUM;
+		break;
+	case 0x0601:
+	case 0x0603:
+	case 0x0605:
+	case 0x0607:
+	case 0x0608:
+	case 0x060A:
+	case 0x060B:
+		flag = CPU_INTEL_P6;
+		break;
+	case 0x0609:
+	case 0x060D:
+		flag = CPU_INTEL_PENTIUM_M;
+		break;
+	case 0x060E:
+		flag = CPU_INTEL_CORE;
+		break;
+	case 0x060F:
+	case 0x0617:
+		flag = CPU_INTEL_CORE2;
+		break;
+	case 0x061C:
+		flag = CPU_INTEL_ATOM;
+		break;
+	case 0x0F00:
+	case 0x0F01:
+	case 0x0F02:
+	case 0x0F03:
+	case 0x0F04:
+		flag = CPU_INTEL_XEON_P4;
+		break;
+	case 0x0F06:
+		flag = CPU_INTEL_XEON_MP;
+		break;
+	default:
+		flag = CPU_NONE;
+		break;
+	}
+
+	return flag;
+}
+
+static int get_cpu_range_count(unsigned cpu)
+{
+	int index;
+
+	switch (per_cpu(cpu_model, cpu) >> 16) {
+	case X86_VENDOR_INTEL:
+		index = ARRAY_SIZE(cpu_intel_range);
+		break;
+	case X86_VENDOR_AMD:
+		index = ARRAY_SIZE(cpu_amd_range);
+		break;
+	default:
+		index = 0;
+		break;
+	}
+
+	return index;
+}
+
+static int is_typeflag_valid(unsigned cpu, unsigned flag)
+{
+	unsigned vendor, modelflag;
+	int i, index;
+
+	/* Standard Registers should be always valid */
+	if (flag >= CPU_TSS)
+		return 1;
+
+	modelflag = per_cpu(cpu_modelflag, cpu);
+	vendor = per_cpu(cpu_model, cpu) >> 16;
+	index = get_cpu_range_count(cpu);
+
+	for (i = 0; i < index; i++) {
+		switch (vendor) {
+		case X86_VENDOR_INTEL:
+			if ((cpu_intel_range[i].model & modelflag) &&
+			    (cpu_intel_range[i].flag & flag))
+				return 1;
+			break;
+		case X86_VENDOR_AMD:
+			if (cpu_amd_range[i].flag & flag)
+				return 1;
+			break;
+		}
+	}
+
+	/* Invalid */
+	return 0;
+}
+
+static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
+			      int index, unsigned flag)
+{
+	unsigned modelflag;
+
+	modelflag = per_cpu(cpu_modelflag, cpu);
+	*max = 0;
+	switch (per_cpu(cpu_model, cpu) >> 16) {
+	case X86_VENDOR_INTEL:
+		if ((cpu_intel_range[index].model & modelflag) &&
+		    (cpu_intel_range[index].flag & flag)) {
+			*min = cpu_intel_range[index].min;
+			*max = cpu_intel_range[index].max;
+		}
+		break;
+	case X86_VENDOR_AMD:
+		if (cpu_amd_range[index].flag & flag) {
+			*min = cpu_amd_range[index].min;
+			*max = cpu_amd_range[index].max;
+		}
+		break;
+	}
+
+	return *max;
+}
+
+/* This function can also be called with seq = NULL for printk */
+static void print_cpu_data(struct seq_file *seq, unsigned type,
+			   u32 low, u32 high)
+{
+	struct cpu_private *priv;
+	u64 val = high;
+
+	if (seq) {
+		priv = seq->private;
+		if (priv->file) {
+			val = (val << 32) | low;
+			seq_printf(seq, "0x%llx\n", val);
+		} else
+			seq_printf(seq, " %08x: %08x_%08x\n",
+				   type, high, low);
+	} else
+		printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
+}
+
+/* This function can also be called with seq = NULL for printk */
+static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
+{
+	unsigned msr, msr_min, msr_max;
+	struct cpu_private *priv;
+	u32 low, high;
+	int i, range;
+
+	if (seq) {
+		priv = seq->private;
+		if (priv->file) {
+			if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
+					       &low, &high))
+				print_cpu_data(seq, priv->reg, low, high);
+			return;
+		}
+	}
+
+	range = get_cpu_range_count(cpu);
+
+	for (i = 0; i < range; i++) {
+		if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
+			continue;
+
+		for (msr = msr_min; msr <= msr_max; msr++) {
+			if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
+				continue;
+			print_cpu_data(seq, msr, low, high);
+		}
+	}
+}
+
+static void print_tss(void *arg)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	struct seq_file *seq = arg;
+	unsigned int seg;
+
+	seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
+	seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
+	seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
+	seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
+
+	seq_printf(seq, " RSI\t: %016lx\n", regs->si);
+	seq_printf(seq, " RDI\t: %016lx\n", regs->di);
+	seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
+	seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
+
+#ifdef CONFIG_X86_64
+	seq_printf(seq, " R08\t: %016lx\n", regs->r8);
+	seq_printf(seq, " R09\t: %016lx\n", regs->r9);
+	seq_printf(seq, " R10\t: %016lx\n", regs->r10);
+	seq_printf(seq, " R11\t: %016lx\n", regs->r11);
+	seq_printf(seq, " R12\t: %016lx\n", regs->r12);
+	seq_printf(seq, " R13\t: %016lx\n", regs->r13);
+	seq_printf(seq, " R14\t: %016lx\n", regs->r14);
+	seq_printf(seq, " R15\t: %016lx\n", regs->r15);
+#endif
+
+	asm("movl %%cs,%0" : "=r" (seg));
+	seq_printf(seq, " CS\t:             %04x\n", seg);
+	asm("movl %%ds,%0" : "=r" (seg));
+	seq_printf(seq, " DS\t:             %04x\n", seg);
+	seq_printf(seq, " SS\t:             %04lx\n", regs->ss);
+	asm("movl %%es,%0" : "=r" (seg));
+	seq_printf(seq, " ES\t:             %04x\n", seg);
+	asm("movl %%fs,%0" : "=r" (seg));
+	seq_printf(seq, " FS\t:             %04x\n", seg);
+	asm("movl %%gs,%0" : "=r" (seg));
+	seq_printf(seq, " GS\t:             %04x\n", seg);
+
+	seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
+
+	seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
+}
+
+static void print_cr(void *arg)
+{
+	struct seq_file *seq = arg;
+
+	seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
+	seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
+	seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
+	seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
+#ifdef CONFIG_X86_64
+	seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
+#endif
+}
+
+static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
+{
+	seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
+}
+
+static void print_dt(void *seq)
+{
+	struct desc_ptr dt;
+	unsigned long ldt;
+
+	/* IDT */
+	store_idt((struct desc_ptr *)&dt);
+	print_desc_ptr("IDT", seq, dt);
+
+	/* GDT */
+	store_gdt((struct desc_ptr *)&dt);
+	print_desc_ptr("GDT", seq, dt);
+
+	/* LDT */
+	store_ldt(ldt);
+	seq_printf(seq, " LDT\t: %016lx\n", ldt);
+
+	/* TR */
+	store_tr(ldt);
+	seq_printf(seq, " TR\t: %016lx\n", ldt);
+}
+
+static void print_dr(void *arg)
+{
+	struct seq_file *seq = arg;
+	unsigned long dr;
+	int i;
+
+	for (i = 0; i < 8; i++) {
+		/* Ignore db4, db5 */
+		if ((i == 4) || (i == 5))
+			continue;
+		get_debugreg(dr, i);
+		seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
+	}
+
+	seq_printf(seq, "\n MSR\t:\n");
+}
+
+static void print_apic(void *arg)
+{
+	struct seq_file *seq = arg;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	seq_printf(seq, " LAPIC\t:\n");
+	seq_printf(seq, " ID\t\t: %08x\n",  apic_read(APIC_ID) >> 24);
+	seq_printf(seq, " LVR\t\t: %08x\n",  apic_read(APIC_LVR));
+	seq_printf(seq, " TASKPRI\t: %08x\n",  apic_read(APIC_TASKPRI));
+	seq_printf(seq, " ARBPRI\t\t: %08x\n",  apic_read(APIC_ARBPRI));
+	seq_printf(seq, " PROCPRI\t: %08x\n",  apic_read(APIC_PROCPRI));
+	seq_printf(seq, " LDR\t\t: %08x\n",  apic_read(APIC_LDR));
+	seq_printf(seq, " DFR\t\t: %08x\n",  apic_read(APIC_DFR));
+	seq_printf(seq, " SPIV\t\t: %08x\n",  apic_read(APIC_SPIV));
+	seq_printf(seq, " ISR\t\t: %08x\n",  apic_read(APIC_ISR));
+	seq_printf(seq, " ESR\t\t: %08x\n",  apic_read(APIC_ESR));
+	seq_printf(seq, " ICR\t\t: %08x\n",  apic_read(APIC_ICR));
+	seq_printf(seq, " ICR2\t\t: %08x\n",  apic_read(APIC_ICR2));
+	seq_printf(seq, " LVTT\t\t: %08x\n",  apic_read(APIC_LVTT));
+	seq_printf(seq, " LVTTHMR\t: %08x\n",  apic_read(APIC_LVTTHMR));
+	seq_printf(seq, " LVTPC\t\t: %08x\n",  apic_read(APIC_LVTPC));
+	seq_printf(seq, " LVT0\t\t: %08x\n",  apic_read(APIC_LVT0));
+	seq_printf(seq, " LVT1\t\t: %08x\n",  apic_read(APIC_LVT1));
+	seq_printf(seq, " LVTERR\t\t: %08x\n",  apic_read(APIC_LVTERR));
+	seq_printf(seq, " TMICT\t\t: %08x\n",  apic_read(APIC_TMICT));
+	seq_printf(seq, " TMCCT\t\t: %08x\n",  apic_read(APIC_TMCCT));
+	seq_printf(seq, " TDCR\t\t: %08x\n",  apic_read(APIC_TDCR));
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+	seq_printf(seq, "\n MSR\t:\n");
+}
+
+static int cpu_seq_show(struct seq_file *seq, void *v)
+{
+	struct cpu_private *priv = seq->private;
+
+	if (priv == NULL)
+		return -EINVAL;
+
+	switch (cpu_base[priv->type].flag) {
+	case CPU_TSS:
+		smp_call_function_single(priv->cpu, print_tss, seq, 1);
+		break;
+	case CPU_CR:
+		smp_call_function_single(priv->cpu, print_cr, seq, 1);
+		break;
+	case CPU_DT:
+		smp_call_function_single(priv->cpu, print_dt, seq, 1);
+		break;
+	case CPU_DEBUG:
+		if (priv->file == CPU_INDEX_BIT)
+			smp_call_function_single(priv->cpu, print_dr, seq, 1);
+		print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
+		break;
+	case CPU_APIC:
+		if (priv->file == CPU_INDEX_BIT)
+			smp_call_function_single(priv->cpu, print_apic, seq, 1);
+		print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
+		break;
+
+	default:
+		print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
+		break;
+	}
+	seq_printf(seq, "\n");
+
+	return 0;
+}
+
+static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	if (*pos == 0) /* One time is enough ;-) */
+		return seq;
+
+	return NULL;
+}
+
+static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	(*pos)++;
+
+	return cpu_seq_start(seq, pos);
+}
+
+static void cpu_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static const struct seq_operations cpu_seq_ops = {
+	.start		= cpu_seq_start,
+	.next		= cpu_seq_next,
+	.stop		= cpu_seq_stop,
+	.show		= cpu_seq_show,
+};
+
+static int cpu_seq_open(struct inode *inode, struct file *file)
+{
+	struct cpu_private *priv = inode->i_private;
+	struct seq_file *seq;
+	int err;
+
+	err = seq_open(file, &cpu_seq_ops);
+	if (!err) {
+		seq = file->private_data;
+		seq->private = priv;
+	}
+
+	return err;
+}
+
+static const struct file_operations cpu_fops = {
+	.open		= cpu_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
+			   unsigned file, struct dentry *dentry)
+{
+	struct cpu_private *priv = NULL;
+
+	/* Already intialized */
+	if (file == CPU_INDEX_BIT)
+		if (per_cpu(cpu_arr[type].init, cpu))
+			return 0;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (priv == NULL)
+		return -ENOMEM;
+
+	priv->cpu = cpu;
+	priv->type = type;
+	priv->reg = reg;
+	priv->file = file;
+	mutex_lock(&cpu_debug_lock);
+	per_cpu(priv_arr[type], cpu) = priv;
+	per_cpu(cpu_priv_count, cpu)++;
+	mutex_unlock(&cpu_debug_lock);
+
+	if (file)
+		debugfs_create_file(cpu_file[file].name, S_IRUGO,
+				    dentry, (void *)priv, &cpu_fops);
+	else {
+		debugfs_create_file(cpu_base[type].name, S_IRUGO,
+				    per_cpu(cpu_arr[type].dentry, cpu),
+				    (void *)priv, &cpu_fops);
+		mutex_lock(&cpu_debug_lock);
+		per_cpu(cpu_arr[type].init, cpu) = 1;
+		mutex_unlock(&cpu_debug_lock);
+	}
+
+	return 0;
+}
+
+static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
+			     struct dentry *dentry)
+{
+	unsigned file;
+	int err = 0;
+
+	for (file = 0; file <  ARRAY_SIZE(cpu_file); file++) {
+		err = cpu_create_file(cpu, type, reg, file, dentry);
+		if (err)
+			return err;
+	}
+
+	return err;
+}
+
+static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
+{
+	struct dentry *cpu_dentry = NULL;
+	unsigned reg, reg_min, reg_max;
+	int i, range, err = 0;
+	char reg_dir[12];
+	u32 low, high;
+
+	range = get_cpu_range_count(cpu);
+
+	for (i = 0; i < range; i++) {
+		if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
+				   cpu_base[type].flag))
+			continue;
+
+		for (reg = reg_min; reg <= reg_max; reg++) {
+			if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
+				continue;
+
+			sprintf(reg_dir, "0x%x", reg);
+			cpu_dentry = debugfs_create_dir(reg_dir, dentry);
+			err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
+			if (err)
+				return err;
+		}
+	}
+
+	return err;
+}
+
+static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
+{
+	struct dentry *cpu_dentry = NULL;
+	unsigned type;
+	int err = 0;
+
+	for (type = 0; type <  ARRAY_SIZE(cpu_base) - 1; type++) {
+		if (!is_typeflag_valid(cpu, cpu_base[type].flag))
+			continue;
+		cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
+		per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
+
+		if (type < CPU_TSS_BIT)
+			err = cpu_init_msr(cpu, type, cpu_dentry);
+		else
+			err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
+					      cpu_dentry);
+		if (err)
+			return err;
+	}
+
+	return err;
+}
+
+static int cpu_init_cpu(void)
+{
+	struct dentry *cpu_dentry = NULL;
+	struct cpuinfo_x86 *cpui;
+	char cpu_dir[12];
+	unsigned cpu;
+	int err = 0;
+
+	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+		cpui = &cpu_data(cpu);
+		if (!cpu_has(cpui, X86_FEATURE_MSR))
+			continue;
+		per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
+					   (cpui->x86 << 8) |
+					   (cpui->x86_model));
+		per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
+
+		sprintf(cpu_dir, "cpu%d", cpu);
+		cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
+		err = cpu_init_allreg(cpu, cpu_dentry);
+
+		pr_info("cpu%d(%d) debug files %d\n",
+			cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu));
+		if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) {
+			pr_err("Register files count %d exceeds limit %d\n",
+				per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES);
+			per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES;
+			err = -ENFILE;
+		}
+		if (err)
+			return err;
+	}
+
+	return err;
+}
+
+static int __init cpu_debug_init(void)
+{
+	cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
+
+	return cpu_init_cpu();
+}
+
+static void __exit cpu_debug_exit(void)
+{
+	int i, cpu;
+
+	if (cpu_debugfs_dir)
+		debugfs_remove_recursive(cpu_debugfs_dir);
+
+	for (cpu = 0; cpu <  nr_cpu_ids; cpu++)
+		for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
+			kfree(per_cpu(priv_arr[i], cpu));
+}
+
+module_init(cpu_debug_init);
+module_exit(cpu_debug_exit);
+
+MODULE_AUTHOR("Jaswinder Singh Rajput");
+MODULE_DESCRIPTION("CPU Debug module");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From fef3a7a17418814733ebde0b40d8e32747677c8f Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 10 Mar 2009 10:56:57 +0800
Subject: x86, kexec: fix kexec x86 coding style

Impact: Cleanup

Fix some coding style issue for kexec x86.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/machine_kexec_32.c   | 17 ++++++++++-------
 arch/x86/kernel/machine_kexec_64.c   | 15 ++++++++-------
 arch/x86/kernel/relocate_kernel_32.S | 24 ++++++++++++++++--------
 arch/x86/kernel/relocate_kernel_64.S | 24 ++++++++++++++++--------
 4 files changed, 50 insertions(+), 30 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index f5fc8c781a6..e7368c1da01 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -14,12 +14,12 @@
 #include <linux/ftrace.h>
 #include <linux/suspend.h>
 #include <linux/gfp.h>
+#include <linux/io.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
-#include <asm/io.h>
 #include <asm/apic.h>
 #include <asm/cpufeature.h>
 #include <asm/desc.h>
@@ -63,7 +63,7 @@ static void load_segments(void)
 		"\tmovl %%eax,%%fs\n"
 		"\tmovl %%eax,%%gs\n"
 		"\tmovl %%eax,%%ss\n"
-		::: "eax", "memory");
+		: : : "eax", "memory");
 #undef STR
 #undef __STR
 }
@@ -205,7 +205,8 @@ void machine_kexec(struct kimage *image)
 
 	if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC
-		/* We need to put APICs in legacy mode so that we can
+		/*
+		 * We need to put APICs in legacy mode so that we can
 		 * get timer interrupts in second kernel. kexec/kdump
 		 * paths already have calls to disable_IO_APIC() in
 		 * one form or other. kexec jump path also need
@@ -227,7 +228,8 @@ void machine_kexec(struct kimage *image)
 		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
 						<< PAGE_SHIFT);
 
-	/* The segment registers are funny things, they have both a
+	/*
+	 * The segment registers are funny things, they have both a
 	 * visible and an invisible part.  Whenever the visible part is
 	 * set to a specific selector, the invisible part is loaded
 	 * with from a table in memory.  At no other time is the
@@ -237,11 +239,12 @@ void machine_kexec(struct kimage *image)
 	 * segments, before I zap the gdt with an invalid value.
 	 */
 	load_segments();
-	/* The gdt & idt are now invalid.
+	/*
+	 * The gdt & idt are now invalid.
 	 * If you want to load them you must set up your own idt & gdt.
 	 */
-	set_gdt(phys_to_virt(0),0);
-	set_idt(phys_to_virt(0),0);
+	set_gdt(phys_to_virt(0), 0);
+	set_idt(phys_to_virt(0), 0);
 
 	/* now call it */
 	image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 6993d51b7fd..f8c796fffa0 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -12,11 +12,11 @@
 #include <linux/reboot.h>
 #include <linux/numa.h>
 #include <linux/ftrace.h>
+#include <linux/io.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
-#include <asm/io.h>
 
 static void init_level2_page(pmd_t *level2p, unsigned long addr)
 {
@@ -83,9 +83,8 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p,
 		}
 		level3p = (pud_t *)page_address(page);
 		result = init_level3_page(image, level3p, addr, last_addr);
-		if (result) {
+		if (result)
 			goto out;
-		}
 		set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
 		addr += PGDIR_SIZE;
 	}
@@ -242,7 +241,8 @@ void machine_kexec(struct kimage *image)
 	page_list[PA_TABLE_PAGE] =
 	  (unsigned long)__pa(page_address(image->control_code_page));
 
-	/* The segment registers are funny things, they have both a
+	/*
+	 * The segment registers are funny things, they have both a
 	 * visible and an invisible part.  Whenever the visible part is
 	 * set to a specific selector, the invisible part is loaded
 	 * with from a table in memory.  At no other time is the
@@ -252,11 +252,12 @@ void machine_kexec(struct kimage *image)
 	 * segments, before I zap the gdt with an invalid value.
 	 */
 	load_segments();
-	/* The gdt & idt are now invalid.
+	/*
+	 * The gdt & idt are now invalid.
 	 * If you want to load them you must set up your own idt & gdt.
 	 */
-	set_gdt(phys_to_virt(0),0);
-	set_idt(phys_to_virt(0),0);
+	set_gdt(phys_to_virt(0), 0);
+	set_idt(phys_to_virt(0), 0);
 
 	/* now call it */
 	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 2064d0aa8d2..41235531b11 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -17,7 +17,8 @@
 
 #define PTR(x) (x << 2)
 
-/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
  * ~ control_page + PAGE_SIZE are used as data storage and stack for
  * jumping back
  */
@@ -76,8 +77,10 @@ relocate_kernel:
 	movl	%eax, CP_PA_SWAP_PAGE(%edi)
 	movl	%ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
 
-	/* get physical address of control page now */
-	/* this is impossible after page table switch */
+	/*
+	 * get physical address of control page now
+	 * this is impossible after page table switch
+	 */
 	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edi
 
 	/* switch to new set of page tables */
@@ -97,7 +100,8 @@ identity_mapped:
 	/* store the start address on the stack */
 	pushl   %edx
 
-	/* Set cr0 to a known state:
+	/*
+	 * Set cr0 to a known state:
 	 *  - Paging disabled
 	 *  - Alignment check disabled
 	 *  - Write protect disabled
@@ -113,7 +117,8 @@ identity_mapped:
 	/* clear cr4 if applicable */
 	testl	%ecx, %ecx
 	jz	1f
-	/* Set cr4 to a known state:
+	/*
+	 * Set cr4 to a known state:
 	 * Setting everything to zero seems safe.
 	 */
 	xorl	%eax, %eax
@@ -132,15 +137,18 @@ identity_mapped:
 	call	swap_pages
 	addl	$8, %esp
 
-	/* To be certain of avoiding problems with self-modifying code
+	/*
+	 * To be certain of avoiding problems with self-modifying code
 	 * I need to execute a serializing instruction here.
 	 * So I flush the TLB, it's handy, and not processor dependent.
 	 */
 	xorl	%eax, %eax
 	movl	%eax, %cr3
 
-	/* set all of the registers to known values */
-	/* leave %esp alone */
+	/*
+	 * set all of the registers to known values
+	 * leave %esp alone
+	 */
 
 	testl	%esi, %esi
 	jnz 1f
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index d32cfb27a47..cfc0d24003d 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -24,7 +24,8 @@
 	.code64
 	.globl relocate_kernel
 relocate_kernel:
-	/* %rdi indirection_page
+	/*
+	 * %rdi indirection_page
 	 * %rsi page_list
 	 * %rdx start address
 	 */
@@ -33,8 +34,10 @@ relocate_kernel:
 	pushq $0
 	popfq
 
-	/* get physical address of control page now */
-	/* this is impossible after page table switch */
+	/*
+	 * get physical address of control page now
+	 * this is impossible after page table switch
+	 */
 	movq	PTR(PA_CONTROL_PAGE)(%rsi), %r8
 
 	/* get physical address of page table now too */
@@ -55,7 +58,8 @@ identity_mapped:
 	/* store the start address on the stack */
 	pushq   %rdx
 
-	/* Set cr0 to a known state:
+	/*
+	 * Set cr0 to a known state:
 	 *  - Paging enabled
 	 *  - Alignment check disabled
 	 *  - Write protect disabled
@@ -68,7 +72,8 @@ identity_mapped:
 	orl	$(X86_CR0_PG | X86_CR0_PE), %eax
 	movq	%rax, %cr0
 
-	/* Set cr4 to a known state:
+	/*
+	 * Set cr4 to a known state:
 	 *  - physical address extension enabled
 	 */
 	movq	$X86_CR4_PAE, %rax
@@ -117,7 +122,8 @@ identity_mapped:
 	jmp	0b
 3:
 
-	/* To be certain of avoiding problems with self-modifying code
+	/*
+	 * To be certain of avoiding problems with self-modifying code
 	 * I need to execute a serializing instruction here.
 	 * So I flush the TLB by reloading %cr3 here, it's handy,
 	 * and not processor dependent.
@@ -125,8 +131,10 @@ identity_mapped:
 	movq	%cr3, %rax
 	movq	%rax, %cr3
 
-	/* set all of the registers to known values */
-	/* leave %rsp alone */
+	/*
+	 * set all of the registers to known values
+	 * leave %rsp alone
+	 */
 
 	xorq	%rax, %rax
 	xorq	%rbx, %rbx
-- 
cgit v1.2.3


From 5359454701ce51a4626b1ef6eb7b16ec35bd458d Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 10 Mar 2009 10:57:04 +0800
Subject: x86, kexec: x86_64: add identity map for pages at image->start

Impact: Fix corner case that cannot yet occur

image->start may be outside of 0 ~ max_pfn, for example when jumping
back to original kernel from kexeced kenrel. This patch add identity
map for pages at image->start.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/machine_kexec_64.c | 42 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index f8c796fffa0..7cc5d3d0148 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -18,6 +18,41 @@
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
 
+static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
+				unsigned long addr)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+	struct page *page;
+	int result = -ENOMEM;
+
+	addr &= PMD_MASK;
+	pgd += pgd_index(addr);
+	if (!pgd_present(*pgd)) {
+		page = kimage_alloc_control_pages(image, 0);
+		if (!page)
+			goto out;
+		pud = (pud_t *)page_address(page);
+		memset(pud, 0, PAGE_SIZE);
+		set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+	}
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud)) {
+		page = kimage_alloc_control_pages(image, 0);
+		if (!page)
+			goto out;
+		pmd = (pmd_t *)page_address(page);
+		memset(pmd, 0, PAGE_SIZE);
+		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+	}
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd))
+		set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+	result = 0;
+out:
+	return result;
+}
+
 static void init_level2_page(pmd_t *level2p, unsigned long addr)
 {
 	unsigned long end_addr;
@@ -153,6 +188,13 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
 	int result;
 	level4p = (pgd_t *)__va(start_pgtable);
 	result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
+	if (result)
+		return result;
+	/*
+	 * image->start may be outside 0 ~ max_pfn, for example when
+	 * jump back to original kernel from kexeced kernel
+	 */
+	result = init_one_level2_page(image, level4p, image->start);
 	if (result)
 		return result;
 	return init_transition_pgtable(image, level4p);
-- 
cgit v1.2.3


From fee7b0d84cc8c7bc5dc212901c79e93eaf83a5b5 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 10 Mar 2009 10:57:16 +0800
Subject: x86, kexec: x86_64: add kexec jump support for x86_64

Impact: New major feature

This patch add kexec jump support for x86_64. More information about
kexec jump can be found in corresponding x86_32 support patch.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/machine_kexec_64.c   |  42 ++++++++-
 arch/x86/kernel/relocate_kernel_64.S | 177 ++++++++++++++++++++++++++++-------
 arch/x86/kernel/vmlinux_64.lds.S     |   7 ++
 3 files changed, 189 insertions(+), 37 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 7cc5d3d0148..89cea4d4467 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -13,6 +13,7 @@
 #include <linux/numa.h>
 #include <linux/ftrace.h>
 #include <linux/io.h>
+#include <linux/suspend.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -270,19 +271,43 @@ void machine_kexec(struct kimage *image)
 {
 	unsigned long page_list[PAGES_NR];
 	void *control_page;
+	int save_ftrace_enabled;
 
-	tracer_disable();
+#ifdef CONFIG_KEXEC_JUMP
+	if (kexec_image->preserve_context)
+		save_processor_state();
+#endif
+
+	save_ftrace_enabled = __ftrace_enabled_save();
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
 
+	if (image->preserve_context) {
+#ifdef CONFIG_X86_IO_APIC
+		/*
+		 * We need to put APICs in legacy mode so that we can
+		 * get timer interrupts in second kernel. kexec/kdump
+		 * paths already have calls to disable_IO_APIC() in
+		 * one form or other. kexec jump path also need
+		 * one.
+		 */
+		disable_IO_APIC();
+#endif
+	}
+
 	control_page = page_address(image->control_code_page) + PAGE_SIZE;
-	memcpy(control_page, relocate_kernel, PAGE_SIZE);
+	memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
 
 	page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
+	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
 	page_list[PA_TABLE_PAGE] =
 	  (unsigned long)__pa(page_address(image->control_code_page));
 
+	if (image->type == KEXEC_TYPE_DEFAULT)
+		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
+						<< PAGE_SHIFT);
+
 	/*
 	 * The segment registers are funny things, they have both a
 	 * visible and an invisible part.  Whenever the visible part is
@@ -302,8 +327,17 @@ void machine_kexec(struct kimage *image)
 	set_idt(phys_to_virt(0), 0);
 
 	/* now call it */
-	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
-			image->start);
+	image->start = relocate_kernel((unsigned long)image->head,
+				       (unsigned long)page_list,
+				       image->start,
+				       image->preserve_context);
+
+#ifdef CONFIG_KEXEC_JUMP
+	if (kexec_image->preserve_context)
+		restore_processor_state();
+#endif
+
+	__ftrace_enabled_restore(save_ftrace_enabled);
 }
 
 void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index cfc0d24003d..4de8f5b3d47 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -19,6 +19,24 @@
 #define PTR(x) (x << 3)
 #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
 
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+ * ~ control_page + PAGE_SIZE are used as data storage and stack for
+ * jumping back
+ */
+#define DATA(offset)		(KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
+
+/* Minimal CPU state */
+#define RSP			DATA(0x0)
+#define CR0			DATA(0x8)
+#define CR3			DATA(0x10)
+#define CR4			DATA(0x18)
+
+/* other data */
+#define CP_PA_TABLE_PAGE	DATA(0x20)
+#define CP_PA_SWAP_PAGE		DATA(0x28)
+#define CP_PA_BACKUP_PAGES_MAP	DATA(0x30)
+
 	.text
 	.align PAGE_SIZE
 	.code64
@@ -28,8 +46,27 @@ relocate_kernel:
 	 * %rdi indirection_page
 	 * %rsi page_list
 	 * %rdx start address
+	 * %rcx preserve_context
 	 */
 
+	/* Save the CPU context, used for jumping back */
+	pushq %rbx
+	pushq %rbp
+	pushq %r12
+	pushq %r13
+	pushq %r14
+	pushq %r15
+	pushf
+
+	movq	PTR(VA_CONTROL_PAGE)(%rsi), %r11
+	movq	%rsp, RSP(%r11)
+	movq	%cr0, %rax
+	movq	%rax, CR0(%r11)
+	movq	%cr3, %rax
+	movq	%rax, CR3(%r11)
+	movq	%cr4, %rax
+	movq	%rax, CR4(%r11)
+
 	/* zero out flags, and disable interrupts */
 	pushq $0
 	popfq
@@ -41,10 +78,18 @@ relocate_kernel:
 	movq	PTR(PA_CONTROL_PAGE)(%rsi), %r8
 
 	/* get physical address of page table now too */
-	movq	PTR(PA_TABLE_PAGE)(%rsi), %rcx
+	movq	PTR(PA_TABLE_PAGE)(%rsi), %r9
+
+	/* get physical address of swap page now */
+	movq	PTR(PA_SWAP_PAGE)(%rsi), %r10
+
+	/* save some information for jumping back */
+	movq	%r9, CP_PA_TABLE_PAGE(%r11)
+	movq	%r10, CP_PA_SWAP_PAGE(%r11)
+	movq	%rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
 
 	/* Switch to the identity mapped page tables */
-	movq	%rcx, %cr3
+	movq	%r9, %cr3
 
 	/* setup a new stack at the end of the physical control page */
 	lea	PAGE_SIZE(%r8), %rsp
@@ -83,9 +128,87 @@ identity_mapped:
 1:
 
 	/* Flush the TLB (needed?) */
-	movq	%rcx, %cr3
+	movq	%r9, %cr3
+
+	movq	%rcx, %r11
+	call	swap_pages
+
+	/*
+	 * To be certain of avoiding problems with self-modifying code
+	 * I need to execute a serializing instruction here.
+	 * So I flush the TLB by reloading %cr3 here, it's handy,
+	 * and not processor dependent.
+	 */
+	movq	%cr3, %rax
+	movq	%rax, %cr3
+
+	/*
+	 * set all of the registers to known values
+	 * leave %rsp alone
+	 */
+
+	testq	%r11, %r11
+	jnz 1f
+	xorq	%rax, %rax
+	xorq	%rbx, %rbx
+	xorq    %rcx, %rcx
+	xorq    %rdx, %rdx
+	xorq    %rsi, %rsi
+	xorq    %rdi, %rdi
+	xorq    %rbp, %rbp
+	xorq	%r8,  %r8
+	xorq	%r9,  %r9
+	xorq	%r10, %r9
+	xorq	%r11, %r11
+	xorq	%r12, %r12
+	xorq	%r13, %r13
+	xorq	%r14, %r14
+	xorq	%r15, %r15
+
+	ret
+
+1:
+	popq	%rdx
+	leaq	PAGE_SIZE(%r10), %rsp
+	call	*%rdx
+
+	/* get the re-entry point of the peer system */
+	movq	0(%rsp), %rbp
+	call	1f
+1:
+	popq	%r8
+	subq	$(1b - relocate_kernel), %r8
+	movq	CP_PA_SWAP_PAGE(%r8), %r10
+	movq	CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
+	movq	CP_PA_TABLE_PAGE(%r8), %rax
+	movq	%rax, %cr3
+	lea	PAGE_SIZE(%r8), %rsp
+	call	swap_pages
+	movq	$virtual_mapped, %rax
+	pushq	%rax
+	ret
+
+virtual_mapped:
+	movq	RSP(%r8), %rsp
+	movq	CR4(%r8), %rax
+	movq	%rax, %cr4
+	movq	CR3(%r8), %rax
+	movq	CR0(%r8), %r8
+	movq	%rax, %cr3
+	movq	%r8, %cr0
+	movq	%rbp, %rax
+
+	popf
+	popq	%r15
+	popq	%r14
+	popq	%r13
+	popq	%r12
+	popq	%rbp
+	popq	%rbx
+	ret
 
 	/* Do the copies */
+swap_pages:
 	movq	%rdi, %rcx 	/* Put the page_list in %rcx */
 	xorq	%rdi, %rdi
 	xorq	%rsi, %rsi
@@ -117,39 +240,27 @@ identity_mapped:
 	movq	%rcx,   %rsi  /* For ever source page do a copy */
 	andq	$0xfffffffffffff000, %rsi
 
+	movq	%rdi, %rdx
+	movq	%rsi, %rax
+
+	movq	%r10, %rdi
 	movq	$512,   %rcx
 	rep ; movsq
-	jmp	0b
-3:
 
-	/*
-	 * To be certain of avoiding problems with self-modifying code
-	 * I need to execute a serializing instruction here.
-	 * So I flush the TLB by reloading %cr3 here, it's handy,
-	 * and not processor dependent.
-	 */
-	movq	%cr3, %rax
-	movq	%rax, %cr3
-
-	/*
-	 * set all of the registers to known values
-	 * leave %rsp alone
-	 */
+	movq	%rax, %rdi
+	movq	%rdx, %rsi
+	movq	$512,   %rcx
+	rep ; movsq
 
-	xorq	%rax, %rax
-	xorq	%rbx, %rbx
-	xorq    %rcx, %rcx
-	xorq    %rdx, %rdx
-	xorq    %rsi, %rsi
-	xorq    %rdi, %rdi
-	xorq    %rbp, %rbp
-	xorq	%r8,  %r8
-	xorq	%r9,  %r9
-	xorq	%r10, %r9
-	xorq	%r11, %r11
-	xorq	%r12, %r12
-	xorq	%r13, %r13
-	xorq	%r14, %r14
-	xorq	%r15, %r15
+	movq	%rdx, %rdi
+	movq	%r10, %rsi
+	movq	$512,   %rcx
+	rep ; movsq
 
+	lea	PAGE_SIZE(%rax), %rsi
+	jmp	0b
+3:
 	ret
+
+	.globl kexec_control_code_size
+.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fbfced6f680..5bf54e40c6e 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -275,3 +275,10 @@ ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
 ASSERT((per_cpu__irq_stack_union == 0),
         "irq_stack_union is not at start of per-cpu area");
 #endif
+
+#ifdef CONFIG_KEXEC
+#include <asm/kexec.h>
+
+ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+       "kexec control code size is too big")
+#endif
-- 
cgit v1.2.3


From 5490fa96735ce0e2af270c0868987d644b9a38ec Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Wed, 11 Mar 2009 10:14:26 +0900
Subject: x86, mce: use round_jiffies() instead round_jiffies_relative()

Impact: saving power _very_ little

round_jiffies() round up absolute jiffies to full second.
round_jiffies_relative() round up relative jiffies to full second.

The "t->expires" is absolute jiffies. Then, round_jiffies() should be
used instead round_jiffies_relative().

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index bfbd5323a63..ca14604611e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -639,7 +639,7 @@ static void mce_init_timer(void)
 	if (!next_interval)
 		return;
 	setup_timer(t, mcheck_timer, smp_processor_id());
-	t->expires = round_jiffies_relative(jiffies + next_interval);
+	t->expires = round_jiffies(jiffies + next_interval);
 	add_timer(t);
 }
 
@@ -1110,7 +1110,7 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 		break;
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
-		t->expires = round_jiffies_relative(jiffies + next_interval);
+		t->expires = round_jiffies(jiffies + next_interval);
 		add_timer_on(t, cpu);
 		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
 		break;
-- 
cgit v1.2.3


From bf5172d07ac38e538e01143289e9b46076494ad5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Mar 2009 22:04:45 +0100
Subject: x86: convert obsolete irq_desc_t typedef to struct irq_desc

Impact: cleanup

Convert the last remaining users.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/visws_quirks.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 191a876e9e8..31ffc24eec4 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -578,7 +578,7 @@ static struct irq_chip piix4_virtual_irq_type = {
 static irqreturn_t piix4_master_intr(int irq, void *dev_id)
 {
 	int realirq;
-	irq_desc_t *desc;
+	struct irq_desc *desc;
 	unsigned long flags;
 
 	spin_lock_irqsave(&i8259A_lock, flags);
-- 
cgit v1.2.3


From 8229d754383e8cd905c38b56bd7365c7fc10dfc1 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Wed, 11 Mar 2009 19:13:49 +0530
Subject: x86: cpu architecture debug code, build fix, cleanup

move store_ldt outside the CONFIG_PARAVIRT section and
also clean up the code a bit.

Signed-off-by: Jaswinder Singh Rajput <jaswinder@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/cpu_debug.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 0bdf4daba20..9abbcbd933c 100755
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -23,6 +23,7 @@
 #include <linux/smp.h>
 
 #include <asm/cpu_debug.h>
+#include <asm/paravirt.h>
 #include <asm/system.h>
 #include <asm/traps.h>
 #include <asm/apic.h>
@@ -427,7 +428,7 @@ static void print_tss(void *arg)
 	seq_printf(seq, " CS\t:             %04x\n", seg);
 	asm("movl %%ds,%0" : "=r" (seg));
 	seq_printf(seq, " DS\t:             %04x\n", seg);
-	seq_printf(seq, " SS\t:             %04lx\n", regs->ss);
+	seq_printf(seq, " SS\t:             %04lx\n", regs->ss & 0xffff);
 	asm("movl %%es,%0" : "=r" (seg));
 	seq_printf(seq, " ES\t:             %04x\n", seg);
 	asm("movl %%fs,%0" : "=r" (seg));
-- 
cgit v1.2.3


From 9fa7266c2f0cdfcb09eacba2d3624bec7df78641 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 12 Mar 2009 10:34:45 +0000
Subject: x86: remove leftover unwind annotations

Impact: cleanup

These got left in needlessly when ret_from_fork got simplified.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <49B8F355.76E4.0078.0@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_64.S | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 7ba4621c0df..54866bb5d38 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -416,7 +416,6 @@ ENTRY(ret_from_fork)
 
 	GET_THREAD_INFO(%rcx)
 
-	CFI_REMEMBER_STATE
 	RESTORE_REST
 
 	testl $3, CS-ARGOFFSET(%rsp)		# from kernel_thread?
@@ -428,7 +427,6 @@ ENTRY(ret_from_fork)
 	RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
 	jmp ret_from_sys_call			# go to the SYSRET fastpath
 
-	CFI_RESTORE_STATE
 	CFI_ENDPROC
 END(ret_from_fork)
 
-- 
cgit v1.2.3


From c2810188c1b810c68139608a207befae0a4f1e69 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 12 Mar 2009 10:38:55 +0000
Subject: x86-64: move save_paranoid into .kprobes.text

Impact: mark save_paranoid as non-kprobe-able code

This appears to be necessary as the function gets called from
kprobes-unsafe exception handling stubs (i.e. which themselves
live in .kprobes.text).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <49B8F44F.76E4.0078.0@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_64.S | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 54866bb5d38..a331ec38af9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -368,6 +368,7 @@ ENTRY(save_rest)
 END(save_rest)
 
 /* save complete stack frame */
+	.pushsection .kprobes.text, "ax"
 ENTRY(save_paranoid)
 	XCPT_FRAME 1 RDI+8
 	cld
@@ -396,6 +397,7 @@ ENTRY(save_paranoid)
 1:	ret
 	CFI_ENDPROC
 END(save_paranoid)
+	.popsection
 
 /*
  * A newly forked process directly context switches into this address.
-- 
cgit v1.2.3


From 02dde8b45c5460794b9052d7c12939fe3eb63c2c Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 12 Mar 2009 12:08:49 +0000
Subject: x86: move various CPU initialization objects into .cpuinit.rodata

Impact: debuggability and micro-optimization

Putting whatever is possible into the (final) .rodata section increases
the likelihood of catching memory corruption bugs early, and reduces
false cache line sharing.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <49B90961.76E4.0078.0@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/addon_cpuid_features.c |  2 +-
 arch/x86/kernel/cpu/amd.c                  |  2 +-
 arch/x86/kernel/cpu/centaur.c              |  2 +-
 arch/x86/kernel/cpu/centaur_64.c           |  2 +-
 arch/x86/kernel/cpu/common.c               | 20 ++++++++++----------
 arch/x86/kernel/cpu/cpu.h                  | 11 ++++++-----
 arch/x86/kernel/cpu/cyrix.c                | 16 ++++++++--------
 arch/x86/kernel/cpu/intel.c                |  2 +-
 arch/x86/kernel/cpu/intel_cacheinfo.c      |  8 ++++----
 arch/x86/kernel/cpu/transmeta.c            |  2 +-
 arch/x86/kernel/cpu/umc.c                  |  2 +-
 arch/x86/kernel/mmconf-fam10h_64.c         |  2 +-
 12 files changed, 36 insertions(+), 35 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 6882a735d9c..8220ae69849 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -29,7 +29,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 	u32 regs[4];
 	const struct cpuid_bit *cb;
 
-	static const struct cpuid_bit cpuid_bits[] = {
+	static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
 		{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
 		{ 0, 0, 0, 0 }
 	};
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f47df59016c..7e4a459daa6 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -502,7 +502,7 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
 }
 #endif
 
-static struct cpu_dev amd_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
 	.c_vendor	= "AMD",
 	.c_ident	= { "AuthenticAMD" },
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 89bfdd9cacc..983e0830f0d 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -468,7 +468,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 	return size;
 }
 
-static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst centaur_cpu_dev = {
 	.c_vendor	= "Centaur",
 	.c_ident	= { "CentaurHauls" },
 	.c_early_init	= early_init_centaur,
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
index a1625f5a1e7..51b09c48c9c 100644
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -25,7 +25,7 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
 	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
 }
 
-static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
+static const struct cpu_dev centaur_cpu_dev __cpuinitconst = {
 	.c_vendor	= "Centaur",
 	.c_ident	= { "CentaurHauls" },
 	.c_early_init	= early_init_centaur,
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f8869978bbb..54cbe7690f9 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -70,7 +70,7 @@ cpumask_t cpu_sibling_setup_map;
 #endif /* CONFIG_X86_32 */
 
 
-static struct cpu_dev *this_cpu __cpuinitdata;
+static const struct cpu_dev *this_cpu __cpuinitdata;
 
 DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
@@ -274,9 +274,9 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
  */
 
 /* Look up CPU names by table lookup. */
-static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
+static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
 {
-	struct cpu_model_info *info;
+	const struct cpu_model_info *info;
 
 	if (c->x86_model >= 16)
 		return NULL;	/* Range check */
@@ -321,7 +321,7 @@ void switch_to_new_gdt(int cpu)
 	load_percpu_segment(cpu);
 }
 
-static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
 
 static void __cpuinit default_init(struct cpuinfo_x86 *c)
 {
@@ -340,7 +340,7 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
 #endif
 }
 
-static struct cpu_dev __cpuinitdata default_cpu = {
+static const struct cpu_dev __cpuinitconst default_cpu = {
 	.c_init	= default_init,
 	.c_vendor = "Unknown",
 	.c_x86_vendor = X86_VENDOR_UNKNOWN,
@@ -634,12 +634,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 
 void __init early_cpu_init(void)
 {
-	struct cpu_dev **cdev;
+	const struct cpu_dev *const *cdev;
 	int count = 0;
 
 	printk("KERNEL supported cpus:\n");
 	for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
-		struct cpu_dev *cpudev = *cdev;
+		const struct cpu_dev *cpudev = *cdev;
 		unsigned int j;
 
 		if (count >= X86_VENDOR_NUM)
@@ -768,7 +768,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 
 	/* If the model name is still unset, do table lookup. */
 	if (!c->x86_model_id[0]) {
-		char *p;
+		const char *p;
 		p = table_lookup_model(c);
 		if (p)
 			strcpy(c->x86_model_id, p);
@@ -847,7 +847,7 @@ struct msr_range {
 	unsigned max;
 };
 
-static struct msr_range msr_range_array[] __cpuinitdata = {
+static const struct msr_range msr_range_array[] __cpuinitconst = {
 	{ 0x00000000, 0x00000418},
 	{ 0xc0000000, 0xc000040b},
 	{ 0xc0010000, 0xc0010142},
@@ -894,7 +894,7 @@ __setup("noclflush", setup_noclflush);
 
 void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
 {
-	char *vendor = NULL;
+	const char *vendor = NULL;
 
 	if (c->x86_vendor < X86_VENDOR_NUM)
 		vendor = this_cpu->c_vendor;
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index de4094a3921..9469ecb5aeb 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -5,15 +5,15 @@
 struct cpu_model_info {
 	int vendor;
 	int family;
-	char *model_names[16];
+	const char *model_names[16];
 };
 
 /* attempt to consolidate cpu attributes */
 struct cpu_dev {
-	char	* c_vendor;
+	const char	* c_vendor;
 
 	/* some have two possibilities for cpuid string */
-	char	* c_ident[2];	
+	const char	* c_ident[2];
 
 	struct		cpu_model_info c_models[4];
 
@@ -25,11 +25,12 @@ struct cpu_dev {
 };
 
 #define cpu_dev_register(cpu_devX) \
-	static struct cpu_dev *__cpu_dev_##cpu_devX __used \
+	static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
 	__attribute__((__section__(".x86_cpu_dev.init"))) = \
 	&cpu_devX;
 
-extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[];
+extern const struct cpu_dev *const __x86_cpu_dev_start[],
+			    *const __x86_cpu_dev_end[];
 
 extern void display_cacheinfo(struct cpuinfo_x86 *c);
 
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index ffd0f5ed071..593171e967e 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -61,23 +61,23 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
  */
 static unsigned char Cx86_dir0_msb __cpuinitdata = 0;
 
-static char Cx86_model[][9] __cpuinitdata = {
+static const char __cpuinitconst Cx86_model[][9] = {
 	"Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
 	"M II ", "Unknown"
 };
-static char Cx486_name[][5] __cpuinitdata = {
+static const char __cpuinitconst Cx486_name[][5] = {
 	"SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
 	"SRx2", "DRx2"
 };
-static char Cx486S_name[][4] __cpuinitdata = {
+static const char __cpuinitconst Cx486S_name[][4] = {
 	"S", "S2", "Se", "S2e"
 };
-static char Cx486D_name[][4] __cpuinitdata = {
+static const char __cpuinitconst Cx486D_name[][4] = {
 	"DX", "DX2", "?", "?", "?", "DX4"
 };
 static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock";
-static char cyrix_model_mult1[] __cpuinitdata = "12??43";
-static char cyrix_model_mult2[] __cpuinitdata = "12233445";
+static const char __cpuinitconst cyrix_model_mult1[] = "12??43";
+static const char __cpuinitconst cyrix_model_mult2[] = "12233445";
 
 /*
  * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
@@ -435,7 +435,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
 	}
 }
 
-static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = {
 	.c_vendor	= "Cyrix",
 	.c_ident	= { "CyrixInstead" },
 	.c_early_init	= early_init_cyrix,
@@ -446,7 +446,7 @@ static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
 
 cpu_dev_register(cyrix_cpu_dev);
 
-static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst nsc_cpu_dev = {
 	.c_vendor	= "NSC",
 	.c_ident	= { "Geode by NSC" },
 	.c_init		= init_nsc,
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 191117f1ad5..968f15129ed 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -410,7 +410,7 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
 }
 #endif
 
-static struct cpu_dev intel_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
 	.c_vendor	= "Intel",
 	.c_ident	= { "GenuineIntel" },
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 7293508d8f5..c471eb1a389 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -32,7 +32,7 @@ struct _cache_table
 };
 
 /* all the cache descriptor types we care about (no TLB or trace cache entries) */
-static struct _cache_table cache_table[] __cpuinitdata =
+static const struct _cache_table __cpuinitconst cache_table[] =
 {
 	{ 0x06, LVL_1_INST, 8 },	/* 4-way set assoc, 32 byte line size */
 	{ 0x08, LVL_1_INST, 16 },	/* 4-way set assoc, 32 byte line size */
@@ -206,15 +206,15 @@ union l3_cache {
 	unsigned val;
 };
 
-static unsigned short assocs[] __cpuinitdata = {
+static const unsigned short __cpuinitconst assocs[] = {
 	[1] = 1, [2] = 2, [4] = 4, [6] = 8,
 	[8] = 16, [0xa] = 32, [0xb] = 48,
 	[0xc] = 64,
 	[0xf] = 0xffff // ??
 };
 
-static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 };
-static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 };
+static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
+static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 };
 
 static void __cpuinit
 amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 52b3fefbd5a..bb62b3e5caa 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -98,7 +98,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
 #endif
 }
 
-static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = {
 	.c_vendor	= "Transmeta",
 	.c_ident	= { "GenuineTMx86", "TransmetaCPU" },
 	.c_early_init	= early_init_transmeta,
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index e777f79e096..fd2c37bf7ac 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -8,7 +8,7 @@
  * so no special init takes place.
  */
 
-static struct cpu_dev umc_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst umc_cpu_dev = {
 	.c_vendor	= "UMC",
 	.c_ident	= { "UMC UMC UMC" },
 	.c_models = {
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 666e43df51f..712d15fdc41 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -226,7 +226,7 @@ static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d)
         return 0;
 }
 
-static struct dmi_system_id __devinitdata mmconf_dmi_table[] = {
+static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
         {
                 .callback = set_check_enable_amd_mmconf,
                 .ident = "Sun Microsystems Machine",
-- 
cgit v1.2.3


From 7a81d9a7da03d2f27840d659f97ef140d032f609 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 12 Mar 2009 12:45:15 +0000
Subject: x86: smarten /proc/interrupts output

Impact: change /proc/interrupts output ABI

With the number of interrupts on large systems growing, assumptions on
the width an interrupt number requires when converted to a decimal
string turn invalid. Therefore, calculate the maximum number of digits
dynamically.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <49B911EB.76E4.0078.0@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq.c | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index b864341dcc4..b8ac3b6cf77 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -45,16 +45,16 @@ void ack_bad_irq(unsigned int irq)
 /*
  * /proc/interrupts printing:
  */
-static int show_other_interrupts(struct seq_file *p)
+static int show_other_interrupts(struct seq_file *p, int prec)
 {
 	int j;
 
-	seq_printf(p, "NMI: ");
+	seq_printf(p, "%*s: ", prec, "NMI");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
 	seq_printf(p, "  Non-maskable interrupts\n");
 #ifdef CONFIG_X86_LOCAL_APIC
-	seq_printf(p, "LOC: ");
+	seq_printf(p, "%*s: ", prec, "LOC");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
 	seq_printf(p, "  Local timer interrupts\n");
@@ -66,40 +66,40 @@ static int show_other_interrupts(struct seq_file *p)
 		seq_printf(p, "  Platform interrupts\n");
 	}
 #ifdef CONFIG_SMP
-	seq_printf(p, "RES: ");
+	seq_printf(p, "%*s: ", prec, "RES");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
 	seq_printf(p, "  Rescheduling interrupts\n");
-	seq_printf(p, "CAL: ");
+	seq_printf(p, "%*s: ", prec, "CAL");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
 	seq_printf(p, "  Function call interrupts\n");
-	seq_printf(p, "TLB: ");
+	seq_printf(p, "%*s: ", prec, "TLB");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
 	seq_printf(p, "  TLB shootdowns\n");
 #endif
 #ifdef CONFIG_X86_MCE
-	seq_printf(p, "TRM: ");
+	seq_printf(p, "%*s: ", prec, "TRM");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
 	seq_printf(p, "  Thermal event interrupts\n");
 # ifdef CONFIG_X86_64
-	seq_printf(p, "THR: ");
+	seq_printf(p, "%*s: ", prec, "THR");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
 	seq_printf(p, "  Threshold APIC interrupts\n");
 # endif
 #endif
 #ifdef CONFIG_X86_LOCAL_APIC
-	seq_printf(p, "SPU: ");
+	seq_printf(p, "%*s: ", prec, "SPU");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
 	seq_printf(p, "  Spurious interrupts\n");
 #endif
-	seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+	seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
 #if defined(CONFIG_X86_IO_APIC)
-	seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+	seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
 #endif
 	return 0;
 }
@@ -107,19 +107,22 @@ static int show_other_interrupts(struct seq_file *p)
 int show_interrupts(struct seq_file *p, void *v)
 {
 	unsigned long flags, any_count = 0;
-	int i = *(loff_t *) v, j;
+	int i = *(loff_t *) v, j, prec;
 	struct irqaction *action;
 	struct irq_desc *desc;
 
 	if (i > nr_irqs)
 		return 0;
 
+	for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
+		j *= 10;
+
 	if (i == nr_irqs)
-		return show_other_interrupts(p);
+		return show_other_interrupts(p, prec);
 
 	/* print header */
 	if (i == 0) {
-		seq_printf(p, "           ");
+		seq_printf(p, "%*s", prec + 8, "");
 		for_each_online_cpu(j)
 			seq_printf(p, "CPU%-8d", j);
 		seq_putc(p, '\n');
@@ -140,7 +143,7 @@ int show_interrupts(struct seq_file *p, void *v)
 	if (!action && !any_count)
 		goto out;
 
-	seq_printf(p, "%3d: ", i);
+	seq_printf(p, "%*d: ", prec, i);
 #ifndef CONFIG_SMP
 	seq_printf(p, "%10u ", kstat_irqs(i));
 #else
-- 
cgit v1.2.3


From 13c6c53282d99c82e79b02477efd2c1e30a991ef Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 12 Mar 2009 12:37:34 +0000
Subject: x86, 32-bit: also use cpuinfo_x86's x86_{phys,virt}_bits members

Impact: 32/64-bit consolidation

In a first step, this allows fixing phys_addr_valid() for PAE (which
until now reported all addresses to be valid). Subsequently, this will
also allow simplifying some MTRR handling code.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <49B9101E.76E4.0078.0@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/common.c | 12 +++++++++++-
 arch/x86/kernel/cpu/intel.c  |  5 +++++
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 826d5c87627..a95e9480bb9 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -549,13 +549,15 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 		}
 	}
 
-#ifdef CONFIG_X86_64
 	if (c->extended_cpuid_level >= 0x80000008) {
 		u32 eax = cpuid_eax(0x80000008);
 
 		c->x86_virt_bits = (eax >> 8) & 0xff;
 		c->x86_phys_bits = eax & 0xff;
 	}
+#ifdef CONFIG_X86_32
+	else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
+		c->x86_phys_bits = 36;
 #endif
 
 	if (c->extended_cpuid_level >= 0x80000007)
@@ -602,8 +604,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_X86_64
 	c->x86_clflush_size = 64;
+	c->x86_phys_bits = 36;
+	c->x86_virt_bits = 48;
 #else
 	c->x86_clflush_size = 32;
+	c->x86_phys_bits = 32;
+	c->x86_virt_bits = 32;
 #endif
 	c->x86_cache_alignment = c->x86_clflush_size;
 
@@ -726,9 +732,13 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 	c->x86_coreid_bits = 0;
 #ifdef CONFIG_X86_64
 	c->x86_clflush_size = 64;
+	c->x86_phys_bits = 36;
+	c->x86_virt_bits = 48;
 #else
 	c->cpuid_level = -1;	/* CPUID not detected */
 	c->x86_clflush_size = 32;
+	c->x86_phys_bits = 32;
+	c->x86_virt_bits = 32;
 #endif
 	c->x86_cache_alignment = c->x86_clflush_size;
 	memset(&c->x86_capability, 0, sizeof c->x86_capability);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 191117f1ad5..ae769471042 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -54,6 +54,11 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 		c->x86_cache_alignment = 128;
 #endif
 
+	/* CPUID workaround for 0F33/0F34 CPU */
+	if (c->x86 == 0xF && c->x86_model == 0x3
+	    && (c->x86_mask == 0x3 || c->x86_mask == 0x4))
+		c->x86_phys_bits = 36;
+
 	/*
 	 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
 	 * with P/T states and does not stop in deep C-states
-- 
cgit v1.2.3


From 9a50156a1c7bfa65315facaffdfbed6513fcfd3b Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 12 Mar 2009 12:41:23 +0000
Subject: x86: properly __init-annotate recent early_printk additions

Impact: cleanup, save memory

Don't keep code resident that's only needed during startup.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <49B91103.76E4.0078.0@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/early_printk.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 639ad98238a..335f049d110 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -250,7 +250,7 @@ static int dbgp_wait_until_complete(void)
 	return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
 }
 
-static void dbgp_mdelay(int ms)
+static void __init dbgp_mdelay(int ms)
 {
 	int i;
 
@@ -311,7 +311,7 @@ static void dbgp_set_data(const void *buf, int size)
 	writel(hi, &ehci_debug->data47);
 }
 
-static void dbgp_get_data(void *buf, int size)
+static void __init dbgp_get_data(void *buf, int size)
 {
 	unsigned char *bytes = buf;
 	u32 lo, hi;
@@ -355,7 +355,7 @@ static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
 	return ret;
 }
 
-static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
+static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
 				 int size)
 {
 	u32 pids, addr, ctrl;
@@ -386,8 +386,8 @@ static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
 	return ret;
 }
 
-static int dbgp_control_msg(unsigned devnum, int requesttype, int request,
-	int value, int index, void *data, int size)
+static int __init dbgp_control_msg(unsigned devnum, int requesttype,
+	int request, int value, int index, void *data, int size)
 {
 	u32 pids, addr, ctrl;
 	struct usb_ctrlrequest req;
@@ -489,7 +489,7 @@ static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
 	return 0;
 }
 
-static int ehci_reset_port(int port)
+static int __init ehci_reset_port(int port)
 {
 	u32 portsc;
 	u32 delay_time, delay;
@@ -532,7 +532,7 @@ static int ehci_reset_port(int port)
 	return -EBUSY;
 }
 
-static int ehci_wait_for_port(int port)
+static int __init ehci_wait_for_port(int port)
 {
 	u32 status;
 	int ret, reps;
@@ -557,13 +557,13 @@ static inline void dbgp_printk(const char *fmt, ...) { }
 
 typedef void (*set_debug_port_t)(int port);
 
-static void default_set_debug_port(int port)
+static void __init default_set_debug_port(int port)
 {
 }
 
-static set_debug_port_t set_debug_port = default_set_debug_port;
+static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
 
-static void nvidia_set_debug_port(int port)
+static void __init nvidia_set_debug_port(int port)
 {
 	u32 dword;
 	dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
-- 
cgit v1.2.3


From 82034d6f59b4772f4233bbb61c670290803a9960 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 12 Mar 2009 12:57:10 +0000
Subject: x86: clean up output resulting from update_mptable option

Impact: cleanup

Without apic=verbose, using the update_mptable option would result in
garbled and confusing output due to the inconsistent use of printk() vs
apic_printk().

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <49B914B6.76E4.0078.0@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/mpparse.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index e8192401da4..47673e02ae5 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -890,12 +890,12 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,
 #ifdef CONFIG_X86_IO_APIC
 				struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
 
-				printk(KERN_INFO "OLD ");
+				apic_printk(APIC_VERBOSE, "OLD ");
 				print_MP_intsrc_info(m);
 				i = get_MP_intsrc_index(m);
 				if (i > 0) {
 					assign_to_mpc_intsrc(&mp_irqs[i], m);
-					printk(KERN_INFO "NEW ");
+					apic_printk(APIC_VERBOSE, "NEW ");
 					print_mp_irq_info(&mp_irqs[i]);
 				} else if (!i) {
 					/* legacy, do nothing */
@@ -943,7 +943,7 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,
 			continue;
 
 		if (nr_m_spare > 0) {
-			printk(KERN_INFO "*NEW* found ");
+			apic_printk(APIC_VERBOSE, "*NEW* found\n");
 			nr_m_spare--;
 			assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
 			m_spare[nr_m_spare] = NULL;
-- 
cgit v1.2.3


From 5c0e6f035df983210e4d22213aed624ced502d3d Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 12 Mar 2009 13:07:23 +0000
Subject: x86: fix code paths used by update_mptable

Impact: fix crashes under Xen due to unrobust e820 code

find_e820_area_size() must return a properly distinguishable and
out-of-bounds value when it fails, and -1UL does not meet that
criteria on i386/PAE. Additionally, callers of the function must
check against that value.

early_reserve_e820() should be prepared for the region found to be
outside of the addressable range on 32-bits.

e820_update_range_map() should not blindly update e820, but should do
all it work on the map it got a pointer passed for (which in 50% of the
cases is &e820_saved). It must also not call e820_add_region(), as that
again acts on e820 unconditionally.

The issues were found when trying to make this option work in our Xen
kernel (i.e. where some of the silent assumptions made in the code
would not hold).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <49B9171B.76E4.0078.0@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/check.c |  2 +-
 arch/x86/kernel/e820.c  | 32 +++++++++++++++++++++++++-------
 2 files changed, 26 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 2ac0ab71412..b617b1164f1 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -83,7 +83,7 @@ void __init setup_bios_corruption_check(void)
 		u64 size;
 		addr = find_e820_area_size(addr, &size, PAGE_SIZE);
 
-		if (addr == 0)
+		if (!(addr + 1))
 			break;
 
 		if ((addr + size) > corruption_check_size)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 508bec1cee2..3cf6681ac80 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -421,7 +421,7 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
 					u64 size, unsigned old_type,
 					unsigned new_type)
 {
-	int i;
+	unsigned int i, x;
 	u64 real_updated_size = 0;
 
 	BUG_ON(old_type == new_type);
@@ -429,7 +429,7 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
 	if (size > (ULLONG_MAX - start))
 		size = ULLONG_MAX - start;
 
-	for (i = 0; i < e820.nr_map; i++) {
+	for (i = 0; i < e820x->nr_map; i++) {
 		struct e820entry *ei = &e820x->map[i];
 		u64 final_start, final_end;
 		if (ei->type != old_type)
@@ -446,14 +446,23 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
 		final_end = min(start + size, ei->addr + ei->size);
 		if (final_start >= final_end)
 			continue;
-		e820_add_region(final_start, final_end - final_start,
-					 new_type);
+
+		x = e820x->nr_map;
+		if (x == ARRAY_SIZE(e820x->map)) {
+			printk(KERN_ERR "Too many memory map entries!\n");
+			break;
+		}
+		e820x->map[x].addr = final_start;
+		e820x->map[x].size = final_end - final_start;
+		e820x->map[x].type = new_type;
+		e820x->nr_map++;
+
 		real_updated_size += final_end - final_start;
 
-		ei->size -= final_end - final_start;
 		if (ei->addr < final_start)
 			continue;
 		ei->addr = final_end;
+		ei->size -= final_end - final_start;
 	}
 	return real_updated_size;
 }
@@ -1020,8 +1029,8 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
 			continue;
 		return addr;
 	}
-	return -1UL;
 
+	return -1ULL;
 }
 
 /*
@@ -1034,13 +1043,22 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
 	u64 start;
 
 	start = startt;
-	while (size < sizet)
+	while (size < sizet && (start + 1))
 		start = find_e820_area_size(start, &size, align);
 
 	if (size < sizet)
 		return 0;
 
+#ifdef CONFIG_X86_32
+	if (start >= MAXMEM)
+		return 0;
+	if (start + size > MAXMEM)
+		size = MAXMEM - start;
+#endif
+
 	addr = round_down(start + size - sizet, align);
+	if (addr < start)
+		return 0;
 	e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
 	e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
 	printk(KERN_INFO "update e820 for early_reserve_e820\n");
-- 
cgit v1.2.3


From 8ad9790588ee2e69118b2b294ddab6f3f0379ad9 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 12 Mar 2009 18:43:54 -0700
Subject: x86: more MTRR debug printouts

Impact: improve MTRR debugging messages

There's still inefficiencies suspected with the MTRR sanitizing
code, so make sure we get all the info we need from a dmesg.

- Remove unneeded mtrr_show

 (It will only printout one time by first cpu, so it is no big deal.)

- Also print out directly from get_mtrr, because it doesn't update mtrr_state.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49B9BA5A.40108@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mtrr/generic.c | 95 ++++++++++++++++++++------------------
 1 file changed, 51 insertions(+), 44 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0c0a455fe95..96440352010 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -33,14 +33,6 @@ u64 mtrr_tom2;
 struct mtrr_state_type mtrr_state = {};
 EXPORT_SYMBOL_GPL(mtrr_state);
 
-static int __initdata mtrr_show;
-static int __init mtrr_debug(char *opt)
-{
-	mtrr_show = 1;
-	return 0;
-}
-early_param("mtrr.show", mtrr_debug);
-
 /*
  * Returns the effective MTRR type for the region
  * Error returns:
@@ -193,13 +185,51 @@ static void print_fixed(unsigned base, unsigned step, const mtrr_type*types)
 	unsigned i;
 
 	for (i = 0; i < 8; ++i, ++types, base += step)
-		printk(KERN_INFO "MTRR %05X-%05X %s\n",
+		printk(KERN_INFO "  %05X-%05X %s\n",
 			base, base + step - 1, mtrr_attrib_to_str(*types));
 }
 
 static void prepare_set(void);
 static void post_set(void);
 
+static void __init print_mtrr_state(void)
+{
+	unsigned int i;
+	int high_width;
+
+	printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
+	if (mtrr_state.have_fixed) {
+		printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
+		       mtrr_state.enabled & 1 ? "en" : "dis");
+		print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
+		for (i = 0; i < 2; ++i)
+			print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
+		for (i = 0; i < 8; ++i)
+			print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
+	}
+	printk(KERN_INFO "MTRR variable ranges %sabled:\n",
+	       mtrr_state.enabled & 2 ? "en" : "dis");
+	high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
+	for (i = 0; i < num_var_ranges; ++i) {
+		if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
+			printk(KERN_INFO "  %u base %0*X%05X000 mask %0*X%05X000 %s\n",
+			       i,
+			       high_width,
+			       mtrr_state.var_ranges[i].base_hi,
+			       mtrr_state.var_ranges[i].base_lo >> 12,
+			       high_width,
+			       mtrr_state.var_ranges[i].mask_hi,
+			       mtrr_state.var_ranges[i].mask_lo >> 12,
+			       mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
+		else
+			printk(KERN_INFO "   %u disabled\n", i);
+	}
+	if (mtrr_tom2) {
+		printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
+				  mtrr_tom2, mtrr_tom2>>20);
+	}
+}
+
 /*  Grab all of the MTRR state for this CPU into *state  */
 void __init get_mtrr_state(void)
 {
@@ -231,41 +261,9 @@ void __init get_mtrr_state(void)
 		mtrr_tom2 |= low;
 		mtrr_tom2 &= 0xffffff800000ULL;
 	}
-	if (mtrr_show) {
-		int high_width;
-
-		printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
-		if (mtrr_state.have_fixed) {
-			printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
-			       mtrr_state.enabled & 1 ? "en" : "dis");
-			print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
-			for (i = 0; i < 2; ++i)
-				print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
-			for (i = 0; i < 8; ++i)
-				print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
-		}
-		printk(KERN_INFO "MTRR variable ranges %sabled:\n",
-		       mtrr_state.enabled & 2 ? "en" : "dis");
-		high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
-		for (i = 0; i < num_var_ranges; ++i) {
-			if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
-				printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n",
-				       i,
-				       high_width,
-				       mtrr_state.var_ranges[i].base_hi,
-				       mtrr_state.var_ranges[i].base_lo >> 12,
-				       high_width,
-				       mtrr_state.var_ranges[i].mask_hi,
-				       mtrr_state.var_ranges[i].mask_lo >> 12,
-				       mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
-			else
-				printk(KERN_INFO "MTRR %u disabled\n", i);
-		}
-		if (mtrr_tom2) {
-			printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
-					  mtrr_tom2, mtrr_tom2>>20);
-		}
-	}
+
+	print_mtrr_state();
+
 	mtrr_state_set = 1;
 
 	/* PAT setup for BP. We need to go through sync steps here */
@@ -377,7 +375,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
 	unsigned int mask_lo, mask_hi, base_lo, base_hi;
 	unsigned int tmp, hi;
 
+	/*
+	 * get_mtrr doesn't need to update mtrr_state, also it could be called
+	 * from any cpu, so try to print it out directly.
+	 */
 	rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
+
 	if ((mask_lo & 0x800) == 0) {
 		/*  Invalid (i.e. free) range  */
 		*base = 0;
@@ -407,6 +410,10 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
 	*size = -mask_lo;
 	*base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
 	*type = base_lo & 0xff;
+
+	printk(KERN_DEBUG "  get_mtrr: cpu%d reg%02d base=%010lx size=%010lx %s\n",
+			smp_processor_id(), reg, *base, *size,
+			mtrr_attrib_to_str(*type & 0xff));
 }
 
 /**
-- 
cgit v1.2.3


From c1ab7e93c6ddf8a068719b97b7e26c3d8eba7c32 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 11 Mar 2009 20:05:46 -0700
Subject: x86: print out mtrr_range_state when user specify size

Impact: print more debug info

Keep it consistent with autodetect version.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49B87C0A.4010105@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mtrr/main.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 236a401b825..311a4734609 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -1424,6 +1424,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
 
 		if (!result[i].bad) {
 			set_var_mtrr_all(address_bits);
+			printk(KERN_DEBUG "New variable MTRRs\n");
+			print_out_mtrr_range_state();
 			return 1;
 		}
 		printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
-- 
cgit v1.2.3


From 0d890355bff25e1dc03a577a90ed80741489ca54 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 11 Mar 2009 20:07:39 -0700
Subject: x86: separate mtrr cleanup/mtrr_e820 trim to separate file

Impact: cleanup

mtrr main.c is too big, seperate mtrr cleanup and mtrr e820 trim
code to another file.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49B87C7B.80809@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mtrr/Makefile  |    2 +-
 arch/x86/kernel/cpu/mtrr/cleanup.c | 1089 ++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/mtrr/main.c    | 1055 +---------------------------------
 arch/x86/kernel/cpu/mtrr/mtrr.h    |    3 +
 4 files changed, 1094 insertions(+), 1055 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/mtrr/cleanup.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index 191fc053364..f4361b56f8e 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
-obj-y		:= main.o if.o generic.o state.o
+obj-y		:= main.o if.o generic.o state.o cleanup.o
 obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
 
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
new file mode 100644
index 00000000000..58b58bbf7eb
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -0,0 +1,1089 @@
+/*  MTRR (Memory Type Range Register) cleanup
+
+    Copyright (C) 2009 Yinghai Lu
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Library General Public
+    License as published by the Free Software Foundation; either
+    version 2 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Library General Public License for more details.
+
+    You should have received a copy of the GNU Library General Public
+    License along with this library; if not, write to the Free
+    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/sort.h>
+
+#include <asm/e820.h>
+#include <asm/mtrr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/kvm_para.h>
+#include "mtrr.h"
+
+/* should be related to MTRR_VAR_RANGES nums */
+#define RANGE_NUM 256
+
+struct res_range {
+	unsigned long start;
+	unsigned long end;
+};
+
+static int __init
+add_range(struct res_range *range, int nr_range, unsigned long start,
+			      unsigned long end)
+{
+	/* out of slots */
+	if (nr_range >= RANGE_NUM)
+		return nr_range;
+
+	range[nr_range].start = start;
+	range[nr_range].end = end;
+
+	nr_range++;
+
+	return nr_range;
+}
+
+static int __init
+add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
+			      unsigned long end)
+{
+	int i;
+
+	/* try to merge it with old one */
+	for (i = 0; i < nr_range; i++) {
+		unsigned long final_start, final_end;
+		unsigned long common_start, common_end;
+
+		if (!range[i].end)
+			continue;
+
+		common_start = max(range[i].start, start);
+		common_end = min(range[i].end, end);
+		if (common_start > common_end + 1)
+			continue;
+
+		final_start = min(range[i].start, start);
+		final_end = max(range[i].end, end);
+
+		range[i].start = final_start;
+		range[i].end =  final_end;
+		return nr_range;
+	}
+
+	/* need to add that */
+	return add_range(range, nr_range, start, end);
+}
+
+static void __init
+subtract_range(struct res_range *range, unsigned long start, unsigned long end)
+{
+	int i, j;
+
+	for (j = 0; j < RANGE_NUM; j++) {
+		if (!range[j].end)
+			continue;
+
+		if (start <= range[j].start && end >= range[j].end) {
+			range[j].start = 0;
+			range[j].end = 0;
+			continue;
+		}
+
+		if (start <= range[j].start && end < range[j].end &&
+		    range[j].start < end + 1) {
+			range[j].start = end + 1;
+			continue;
+		}
+
+
+		if (start > range[j].start && end >= range[j].end &&
+		    range[j].end > start - 1) {
+			range[j].end = start - 1;
+			continue;
+		}
+
+		if (start > range[j].start && end < range[j].end) {
+			/* find the new spare */
+			for (i = 0; i < RANGE_NUM; i++) {
+				if (range[i].end == 0)
+					break;
+			}
+			if (i < RANGE_NUM) {
+				range[i].end = range[j].end;
+				range[i].start = end + 1;
+			} else {
+				printk(KERN_ERR "run of slot in ranges\n");
+			}
+			range[j].end = start - 1;
+			continue;
+		}
+	}
+}
+
+static int __init cmp_range(const void *x1, const void *x2)
+{
+	const struct res_range *r1 = x1;
+	const struct res_range *r2 = x2;
+	long start1, start2;
+
+	start1 = r1->start;
+	start2 = r2->start;
+
+	return start1 - start2;
+}
+
+struct var_mtrr_range_state {
+	unsigned long base_pfn;
+	unsigned long size_pfn;
+	mtrr_type type;
+};
+
+static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
+static int __initdata debug_print;
+
+static int __init
+x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
+		       unsigned long extra_remove_base,
+		       unsigned long extra_remove_size)
+{
+	unsigned long i, base, size;
+	mtrr_type type;
+
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		if (type != MTRR_TYPE_WRBACK)
+			continue;
+		base = range_state[i].base_pfn;
+		size = range_state[i].size_pfn;
+		nr_range = add_range_with_merge(range, nr_range, base,
+						base + size - 1);
+	}
+	if (debug_print) {
+		printk(KERN_DEBUG "After WB checking\n");
+		for (i = 0; i < nr_range; i++)
+			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+				 range[i].start, range[i].end + 1);
+	}
+
+	/* take out UC ranges */
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		if (type != MTRR_TYPE_UNCACHABLE &&
+		    type != MTRR_TYPE_WRPROT)
+			continue;
+		size = range_state[i].size_pfn;
+		if (!size)
+			continue;
+		base = range_state[i].base_pfn;
+		subtract_range(range, base, base + size - 1);
+	}
+	if (extra_remove_size)
+		subtract_range(range, extra_remove_base,
+				 extra_remove_base + extra_remove_size  - 1);
+
+	/* get new range num */
+	nr_range = 0;
+	for (i = 0; i < RANGE_NUM; i++) {
+		if (!range[i].end)
+			continue;
+		nr_range++;
+	}
+	if  (debug_print) {
+		printk(KERN_DEBUG "After UC checking\n");
+		for (i = 0; i < nr_range; i++)
+			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+				 range[i].start, range[i].end + 1);
+	}
+
+	/* sort the ranges */
+	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+	if  (debug_print) {
+		printk(KERN_DEBUG "After sorting\n");
+		for (i = 0; i < nr_range; i++)
+			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+				 range[i].start, range[i].end + 1);
+	}
+
+	/* clear those is not used */
+	for (i = nr_range; i < RANGE_NUM; i++)
+		memset(&range[i], 0, sizeof(range[i]));
+
+	return nr_range;
+}
+
+static struct res_range __initdata range[RANGE_NUM];
+static int __initdata nr_range;
+
+#ifdef CONFIG_MTRR_SANITIZER
+
+static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
+{
+	unsigned long sum;
+	int i;
+
+	sum = 0;
+	for (i = 0; i < nr_range; i++)
+		sum += range[i].end + 1 - range[i].start;
+
+	return sum;
+}
+
+static int enable_mtrr_cleanup __initdata =
+	CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
+
+static int __init disable_mtrr_cleanup_setup(char *str)
+{
+	enable_mtrr_cleanup = 0;
+	return 0;
+}
+early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
+
+static int __init enable_mtrr_cleanup_setup(char *str)
+{
+	enable_mtrr_cleanup = 1;
+	return 0;
+}
+early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
+
+static int __init mtrr_cleanup_debug_setup(char *str)
+{
+	debug_print = 1;
+	return 0;
+}
+early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
+
+struct var_mtrr_state {
+	unsigned long	range_startk;
+	unsigned long	range_sizek;
+	unsigned long	chunk_sizek;
+	unsigned long	gran_sizek;
+	unsigned int	reg;
+};
+
+static void __init
+set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+		unsigned char type, unsigned int address_bits)
+{
+	u32 base_lo, base_hi, mask_lo, mask_hi;
+	u64 base, mask;
+
+	if (!sizek) {
+		fill_mtrr_var_range(reg, 0, 0, 0, 0);
+		return;
+	}
+
+	mask = (1ULL << address_bits) - 1;
+	mask &= ~((((u64)sizek) << 10) - 1);
+
+	base  = ((u64)basek) << 10;
+
+	base |= type;
+	mask |= 0x800;
+
+	base_lo = base & ((1ULL<<32) - 1);
+	base_hi = base >> 32;
+
+	mask_lo = mask & ((1ULL<<32) - 1);
+	mask_hi = mask >> 32;
+
+	fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
+}
+
+static void __init
+save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+		unsigned char type)
+{
+	range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
+	range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
+	range_state[reg].type = type;
+}
+
+static void __init
+set_var_mtrr_all(unsigned int address_bits)
+{
+	unsigned long basek, sizek;
+	unsigned char type;
+	unsigned int reg;
+
+	for (reg = 0; reg < num_var_ranges; reg++) {
+		basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
+		sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
+		type = range_state[reg].type;
+
+		set_var_mtrr(reg, basek, sizek, type, address_bits);
+	}
+}
+
+static unsigned long to_size_factor(unsigned long sizek, char *factorp)
+{
+	char factor;
+	unsigned long base = sizek;
+
+	if (base & ((1<<10) - 1)) {
+		/* not MB alignment */
+		factor = 'K';
+	} else if (base & ((1<<20) - 1)) {
+		factor = 'M';
+		base >>= 10;
+	} else {
+		factor = 'G';
+		base >>= 20;
+	}
+
+	*factorp = factor;
+
+	return base;
+}
+
+static unsigned int __init
+range_to_mtrr(unsigned int reg, unsigned long range_startk,
+	      unsigned long range_sizek, unsigned char type)
+{
+	if (!range_sizek || (reg >= num_var_ranges))
+		return reg;
+
+	while (range_sizek) {
+		unsigned long max_align, align;
+		unsigned long sizek;
+
+		/* Compute the maximum size I can make a range */
+		if (range_startk)
+			max_align = ffs(range_startk) - 1;
+		else
+			max_align = 32;
+		align = fls(range_sizek) - 1;
+		if (align > max_align)
+			align = max_align;
+
+		sizek = 1 << align;
+		if (debug_print) {
+			char start_factor = 'K', size_factor = 'K';
+			unsigned long start_base, size_base;
+
+			start_base = to_size_factor(range_startk,
+							 &start_factor),
+			size_base = to_size_factor(sizek, &size_factor),
+
+			printk(KERN_DEBUG "Setting variable MTRR %d, "
+				"base: %ld%cB, range: %ld%cB, type %s\n",
+				reg, start_base, start_factor,
+				size_base, size_factor,
+				(type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+				   ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")
+				);
+		}
+		save_var_mtrr(reg++, range_startk, sizek, type);
+		range_startk += sizek;
+		range_sizek -= sizek;
+		if (reg >= num_var_ranges)
+			break;
+	}
+	return reg;
+}
+
+static unsigned __init
+range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
+			unsigned long sizek)
+{
+	unsigned long hole_basek, hole_sizek;
+	unsigned long second_basek, second_sizek;
+	unsigned long range0_basek, range0_sizek;
+	unsigned long range_basek, range_sizek;
+	unsigned long chunk_sizek;
+	unsigned long gran_sizek;
+
+	hole_basek = 0;
+	hole_sizek = 0;
+	second_basek = 0;
+	second_sizek = 0;
+	chunk_sizek = state->chunk_sizek;
+	gran_sizek = state->gran_sizek;
+
+	/* align with gran size, prevent small block used up MTRRs */
+	range_basek = ALIGN(state->range_startk, gran_sizek);
+	if ((range_basek > basek) && basek)
+		return second_sizek;
+	state->range_sizek -= (range_basek - state->range_startk);
+	range_sizek = ALIGN(state->range_sizek, gran_sizek);
+
+	while (range_sizek > state->range_sizek) {
+		range_sizek -= gran_sizek;
+		if (!range_sizek)
+			return 0;
+	}
+	state->range_sizek = range_sizek;
+
+	/* try to append some small hole */
+	range0_basek = state->range_startk;
+	range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
+
+	/* no increase */
+	if (range0_sizek == state->range_sizek) {
+		if (debug_print)
+			printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
+				range0_basek<<10,
+				(range0_basek + state->range_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, range0_basek,
+				state->range_sizek, MTRR_TYPE_WRBACK);
+		return 0;
+	}
+
+	/* only cut back, when it is not the last */
+	if (sizek) {
+		while (range0_basek + range0_sizek > (basek + sizek)) {
+			if (range0_sizek >= chunk_sizek)
+				range0_sizek -= chunk_sizek;
+			else
+				range0_sizek = 0;
+
+			if (!range0_sizek)
+				break;
+		}
+	}
+
+second_try:
+	range_basek = range0_basek + range0_sizek;
+
+	/* one hole in the middle */
+	if (range_basek > basek && range_basek <= (basek + sizek))
+		second_sizek = range_basek - basek;
+
+	if (range0_sizek > state->range_sizek) {
+
+		/* one hole in middle or at end */
+		hole_sizek = range0_sizek - state->range_sizek - second_sizek;
+
+		/* hole size should be less than half of range0 size */
+		if (hole_sizek >= (range0_sizek >> 1) &&
+		    range0_sizek >= chunk_sizek) {
+			range0_sizek -= chunk_sizek;
+			second_sizek = 0;
+			hole_sizek = 0;
+
+			goto second_try;
+		}
+	}
+
+	if (range0_sizek) {
+		if (debug_print)
+			printk(KERN_DEBUG "range0: %016lx - %016lx\n",
+				range0_basek<<10,
+				(range0_basek + range0_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, range0_basek,
+				range0_sizek, MTRR_TYPE_WRBACK);
+	}
+
+	if (range0_sizek < state->range_sizek) {
+		/* need to handle left over */
+		range_sizek = state->range_sizek - range0_sizek;
+
+		if (debug_print)
+			printk(KERN_DEBUG "range: %016lx - %016lx\n",
+				 range_basek<<10,
+				 (range_basek + range_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, range_basek,
+				 range_sizek, MTRR_TYPE_WRBACK);
+	}
+
+	if (hole_sizek) {
+		hole_basek = range_basek - hole_sizek - second_sizek;
+		if (debug_print)
+			printk(KERN_DEBUG "hole: %016lx - %016lx\n",
+				 hole_basek<<10,
+				 (hole_basek + hole_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, hole_basek,
+				 hole_sizek, MTRR_TYPE_UNCACHABLE);
+	}
+
+	return second_sizek;
+}
+
+static void __init
+set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
+		   unsigned long size_pfn)
+{
+	unsigned long basek, sizek;
+	unsigned long second_sizek = 0;
+
+	if (state->reg >= num_var_ranges)
+		return;
+
+	basek = base_pfn << (PAGE_SHIFT - 10);
+	sizek = size_pfn << (PAGE_SHIFT - 10);
+
+	/* See if I can merge with the last range */
+	if ((basek <= 1024) ||
+	    (state->range_startk + state->range_sizek == basek)) {
+		unsigned long endk = basek + sizek;
+		state->range_sizek = endk - state->range_startk;
+		return;
+	}
+	/* Write the range mtrrs */
+	if (state->range_sizek != 0)
+		second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
+
+	/* Allocate an msr */
+	state->range_startk = basek + second_sizek;
+	state->range_sizek  = sizek - second_sizek;
+}
+
+/* mininum size of mtrr block that can take hole */
+static u64 mtrr_chunk_size __initdata = (256ULL<<20);
+
+static int __init parse_mtrr_chunk_size_opt(char *p)
+{
+	if (!p)
+		return -EINVAL;
+	mtrr_chunk_size = memparse(p, &p);
+	return 0;
+}
+early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
+
+/* granity of mtrr of block */
+static u64 mtrr_gran_size __initdata;
+
+static int __init parse_mtrr_gran_size_opt(char *p)
+{
+	if (!p)
+		return -EINVAL;
+	mtrr_gran_size = memparse(p, &p);
+	return 0;
+}
+early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
+
+static int nr_mtrr_spare_reg __initdata =
+				 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
+
+static int __init parse_mtrr_spare_reg(char *arg)
+{
+	if (arg)
+		nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
+	return 0;
+}
+
+early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
+
+static int __init
+x86_setup_var_mtrrs(struct res_range *range, int nr_range,
+		    u64 chunk_size, u64 gran_size)
+{
+	struct var_mtrr_state var_state;
+	int i;
+	int num_reg;
+
+	var_state.range_startk	= 0;
+	var_state.range_sizek	= 0;
+	var_state.reg		= 0;
+	var_state.chunk_sizek	= chunk_size >> 10;
+	var_state.gran_sizek	= gran_size >> 10;
+
+	memset(range_state, 0, sizeof(range_state));
+
+	/* Write the range etc */
+	for (i = 0; i < nr_range; i++)
+		set_var_mtrr_range(&var_state, range[i].start,
+				   range[i].end - range[i].start + 1);
+
+	/* Write the last range */
+	if (var_state.range_sizek != 0)
+		range_to_mtrr_with_hole(&var_state, 0, 0);
+
+	num_reg = var_state.reg;
+	/* Clear out the extra MTRR's */
+	while (var_state.reg < num_var_ranges) {
+		save_var_mtrr(var_state.reg, 0, 0, 0);
+		var_state.reg++;
+	}
+
+	return num_reg;
+}
+
+struct mtrr_cleanup_result {
+	unsigned long gran_sizek;
+	unsigned long chunk_sizek;
+	unsigned long lose_cover_sizek;
+	unsigned int num_reg;
+	int bad;
+};
+
+/*
+ * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
+ * chunk size: gran_size, ..., 2G
+ * so we need (1+16)*8
+ */
+#define NUM_RESULT	136
+#define PSHIFT		(PAGE_SHIFT - 10)
+
+static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
+static unsigned long __initdata min_loss_pfn[RANGE_NUM];
+
+static void __init print_out_mtrr_range_state(void)
+{
+	int i;
+	char start_factor = 'K', size_factor = 'K';
+	unsigned long start_base, size_base;
+	mtrr_type type;
+
+	for (i = 0; i < num_var_ranges; i++) {
+
+		size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
+		if (!size_base)
+			continue;
+
+		size_base = to_size_factor(size_base, &size_factor),
+		start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
+		start_base = to_size_factor(start_base, &start_factor),
+		type = range_state[i].type;
+
+		printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+			i, start_base, start_factor,
+			size_base, size_factor,
+			(type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+			    ((type == MTRR_TYPE_WRPROT) ? "WP" :
+			     ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
+			);
+	}
+}
+
+static int __init mtrr_need_cleanup(void)
+{
+	int i;
+	mtrr_type type;
+	unsigned long size;
+	/* extra one for all 0 */
+	int num[MTRR_NUM_TYPES + 1];
+
+	/* check entries number */
+	memset(num, 0, sizeof(num));
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		size = range_state[i].size_pfn;
+		if (type >= MTRR_NUM_TYPES)
+			continue;
+		if (!size)
+			type = MTRR_NUM_TYPES;
+		if (type == MTRR_TYPE_WRPROT)
+			type = MTRR_TYPE_UNCACHABLE;
+		num[type]++;
+	}
+
+	/* check if we got UC entries */
+	if (!num[MTRR_TYPE_UNCACHABLE])
+		return 0;
+
+	/* check if we only had WB and UC */
+	if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+		num_var_ranges - num[MTRR_NUM_TYPES])
+		return 0;
+
+	return 1;
+}
+
+static unsigned long __initdata range_sums;
+static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
+					 unsigned long extra_remove_base,
+					 unsigned long extra_remove_size,
+					 int i)
+{
+	int num_reg;
+	static struct res_range range_new[RANGE_NUM];
+	static int nr_range_new;
+	unsigned long range_sums_new;
+
+	/* convert ranges to var ranges state */
+	num_reg = x86_setup_var_mtrrs(range, nr_range,
+						chunk_size, gran_size);
+
+	/* we got new setting in range_state, check it */
+	memset(range_new, 0, sizeof(range_new));
+	nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+				extra_remove_base, extra_remove_size);
+	range_sums_new = sum_ranges(range_new, nr_range_new);
+
+	result[i].chunk_sizek = chunk_size >> 10;
+	result[i].gran_sizek = gran_size >> 10;
+	result[i].num_reg = num_reg;
+	if (range_sums < range_sums_new) {
+		result[i].lose_cover_sizek =
+			(range_sums_new - range_sums) << PSHIFT;
+		result[i].bad = 1;
+	} else
+		result[i].lose_cover_sizek =
+			(range_sums - range_sums_new) << PSHIFT;
+
+	/* double check it */
+	if (!result[i].bad && !result[i].lose_cover_sizek) {
+		if (nr_range_new != nr_range ||
+			memcmp(range, range_new, sizeof(range)))
+				result[i].bad = 1;
+	}
+
+	if (!result[i].bad && (range_sums - range_sums_new <
+				min_loss_pfn[num_reg])) {
+		min_loss_pfn[num_reg] =
+			range_sums - range_sums_new;
+	}
+}
+
+static void __init mtrr_print_out_one_result(int i)
+{
+	char gran_factor, chunk_factor, lose_factor;
+	unsigned long gran_base, chunk_base, lose_base;
+
+	gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+	chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+	lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+	printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+			result[i].bad ? "*BAD*" : " ",
+			gran_base, gran_factor, chunk_base, chunk_factor);
+	printk(KERN_CONT "num_reg: %d  \tlose cover RAM: %s%ld%c\n",
+			result[i].num_reg, result[i].bad ? "-" : "",
+			lose_base, lose_factor);
+}
+
+static int __init mtrr_search_optimal_index(void)
+{
+	int i;
+	int num_reg_good;
+	int index_good;
+
+	if (nr_mtrr_spare_reg >= num_var_ranges)
+		nr_mtrr_spare_reg = num_var_ranges - 1;
+	num_reg_good = -1;
+	for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
+		if (!min_loss_pfn[i])
+			num_reg_good = i;
+	}
+
+	index_good = -1;
+	if (num_reg_good != -1) {
+		for (i = 0; i < NUM_RESULT; i++) {
+			if (!result[i].bad &&
+			    result[i].num_reg == num_reg_good &&
+			    !result[i].lose_cover_sizek) {
+				index_good = i;
+				break;
+			}
+		}
+	}
+
+	return index_good;
+}
+
+
+int __init mtrr_cleanup(unsigned address_bits)
+{
+	unsigned long extra_remove_base, extra_remove_size;
+	unsigned long base, size, def, dummy;
+	mtrr_type type;
+	u64 chunk_size, gran_size;
+	int index_good;
+	int i;
+
+	if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
+		return 0;
+	rdmsr(MTRRdefType_MSR, def, dummy);
+	def &= 0xff;
+	if (def != MTRR_TYPE_UNCACHABLE)
+		return 0;
+
+	/* get it and store it aside */
+	memset(range_state, 0, sizeof(range_state));
+	for (i = 0; i < num_var_ranges; i++) {
+		mtrr_if->get(i, &base, &size, &type);
+		range_state[i].base_pfn = base;
+		range_state[i].size_pfn = size;
+		range_state[i].type = type;
+	}
+
+	/* check if we need handle it and can handle it */
+	if (!mtrr_need_cleanup())
+		return 0;
+
+	/* print original var MTRRs at first, for debugging: */
+	printk(KERN_DEBUG "original variable MTRRs\n");
+	print_out_mtrr_range_state();
+
+	memset(range, 0, sizeof(range));
+	extra_remove_size = 0;
+	extra_remove_base = 1 << (32 - PAGE_SHIFT);
+	if (mtrr_tom2)
+		extra_remove_size =
+			(mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
+	nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
+					  extra_remove_size);
+	/*
+	 * [0, 1M) should always be coverred by var mtrr with WB
+	 * and fixed mtrrs should take effective before var mtrr for it
+	 */
+	nr_range = add_range_with_merge(range, nr_range, 0,
+					(1ULL<<(20 - PAGE_SHIFT)) - 1);
+	/* sort the ranges */
+	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+
+	range_sums = sum_ranges(range, nr_range);
+	printk(KERN_INFO "total RAM coverred: %ldM\n",
+	       range_sums >> (20 - PAGE_SHIFT));
+
+	if (mtrr_chunk_size && mtrr_gran_size) {
+		i = 0;
+		mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
+				      extra_remove_base, extra_remove_size, i);
+
+		mtrr_print_out_one_result(i);
+
+		if (!result[i].bad) {
+			set_var_mtrr_all(address_bits);
+			printk(KERN_DEBUG "New variable MTRRs\n");
+			print_out_mtrr_range_state();
+			return 1;
+		}
+		printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
+		       "will find optimal one\n");
+	}
+
+	i = 0;
+	memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
+	memset(result, 0, sizeof(result));
+	for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
+
+		for (chunk_size = gran_size; chunk_size < (1ULL<<32);
+		     chunk_size <<= 1) {
+
+			if (i >= NUM_RESULT)
+				continue;
+
+			mtrr_calc_range_state(chunk_size, gran_size,
+				      extra_remove_base, extra_remove_size, i);
+			if (debug_print) {
+				mtrr_print_out_one_result(i);
+				printk(KERN_INFO "\n");
+			}
+
+			i++;
+		}
+	}
+
+	/* try to find the optimal index */
+	index_good = mtrr_search_optimal_index();
+
+	if (index_good != -1) {
+		printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
+		i = index_good;
+		mtrr_print_out_one_result(i);
+
+		/* convert ranges to var ranges state */
+		chunk_size = result[i].chunk_sizek;
+		chunk_size <<= 10;
+		gran_size = result[i].gran_sizek;
+		gran_size <<= 10;
+		x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+		set_var_mtrr_all(address_bits);
+		printk(KERN_DEBUG "New variable MTRRs\n");
+		print_out_mtrr_range_state();
+		return 1;
+	} else {
+		/* print out all */
+		for (i = 0; i < NUM_RESULT; i++)
+			mtrr_print_out_one_result(i);
+	}
+
+	printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
+	printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
+
+	return 0;
+}
+#else
+int __init mtrr_cleanup(unsigned address_bits)
+{
+	return 0;
+}
+#endif
+
+static int disable_mtrr_trim;
+
+static int __init disable_mtrr_trim_setup(char *str)
+{
+	disable_mtrr_trim = 1;
+	return 0;
+}
+early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
+
+/*
+ * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
+ * for memory >4GB. Check for that here.
+ * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
+ * apply to are wrong, but so far we don't know of any such case in the wild.
+ */
+#define Tom2Enabled (1U << 21)
+#define Tom2ForceMemTypeWB (1U << 22)
+
+int __init amd_special_default_mtrr(void)
+{
+	u32 l, h;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+		return 0;
+	if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
+		return 0;
+	/* In case some hypervisor doesn't pass SYSCFG through */
+	if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
+		return 0;
+	/*
+	 * Memory between 4GB and top of mem is forced WB by this magic bit.
+	 * Reserved before K8RevF, but should be zero there.
+	 */
+	if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
+		 (Tom2Enabled | Tom2ForceMemTypeWB))
+		return 1;
+	return 0;
+}
+
+static u64 __init real_trim_memory(unsigned long start_pfn,
+				   unsigned long limit_pfn)
+{
+	u64 trim_start, trim_size;
+	trim_start = start_pfn;
+	trim_start <<= PAGE_SHIFT;
+	trim_size = limit_pfn;
+	trim_size <<= PAGE_SHIFT;
+	trim_size -= trim_start;
+
+	return e820_update_range(trim_start, trim_size, E820_RAM,
+				E820_RESERVED);
+}
+/**
+ * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
+ * @end_pfn: ending page frame number
+ *
+ * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
+ * memory configurations.  This routine checks that the highest MTRR matches
+ * the end of memory, to make sure the MTRRs having a write back type cover
+ * all of the memory the kernel is intending to use. If not, it'll trim any
+ * memory off the end by adjusting end_pfn, removing it from the kernel's
+ * allocation pools, warning the user with an obnoxious message.
+ */
+int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
+{
+	unsigned long i, base, size, highest_pfn = 0, def, dummy;
+	mtrr_type type;
+	u64 total_trim_size;
+
+	/* extra one for all 0 */
+	int num[MTRR_NUM_TYPES + 1];
+	/*
+	 * Make sure we only trim uncachable memory on machines that
+	 * support the Intel MTRR architecture:
+	 */
+	if (!is_cpu(INTEL) || disable_mtrr_trim)
+		return 0;
+	rdmsr(MTRRdefType_MSR, def, dummy);
+	def &= 0xff;
+	if (def != MTRR_TYPE_UNCACHABLE)
+		return 0;
+
+	/* get it and store it aside */
+	memset(range_state, 0, sizeof(range_state));
+	for (i = 0; i < num_var_ranges; i++) {
+		mtrr_if->get(i, &base, &size, &type);
+		range_state[i].base_pfn = base;
+		range_state[i].size_pfn = size;
+		range_state[i].type = type;
+	}
+
+	/* Find highest cached pfn */
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		if (type != MTRR_TYPE_WRBACK)
+			continue;
+		base = range_state[i].base_pfn;
+		size = range_state[i].size_pfn;
+		if (highest_pfn < base + size)
+			highest_pfn = base + size;
+	}
+
+	/* kvm/qemu doesn't have mtrr set right, don't trim them all */
+	if (!highest_pfn) {
+		printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
+		return 0;
+	}
+
+	/* check entries number */
+	memset(num, 0, sizeof(num));
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		if (type >= MTRR_NUM_TYPES)
+			continue;
+		size = range_state[i].size_pfn;
+		if (!size)
+			type = MTRR_NUM_TYPES;
+		num[type]++;
+	}
+
+	/* no entry for WB? */
+	if (!num[MTRR_TYPE_WRBACK])
+		return 0;
+
+	/* check if we only had WB and UC */
+	if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+		num_var_ranges - num[MTRR_NUM_TYPES])
+		return 0;
+
+	memset(range, 0, sizeof(range));
+	nr_range = 0;
+	if (mtrr_tom2) {
+		range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
+		range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
+		if (highest_pfn < range[nr_range].end + 1)
+			highest_pfn = range[nr_range].end + 1;
+		nr_range++;
+	}
+	nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
+
+	total_trim_size = 0;
+	/* check the head */
+	if (range[0].start)
+		total_trim_size += real_trim_memory(0, range[0].start);
+	/* check the holes */
+	for (i = 0; i < nr_range - 1; i++) {
+		if (range[i].end + 1 < range[i+1].start)
+			total_trim_size += real_trim_memory(range[i].end + 1,
+							    range[i+1].start);
+	}
+	/* check the top */
+	i = nr_range - 1;
+	if (range[i].end + 1 < end_pfn)
+		total_trim_size += real_trim_memory(range[i].end + 1,
+							 end_pfn);
+
+	if (total_trim_size) {
+		printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
+			" all of memory, losing %lluMB of RAM.\n",
+			total_trim_size >> 20);
+
+		if (!changed_by_mtrr_cleanup)
+			WARN_ON(1);
+
+		printk(KERN_INFO "update e820 for mtrr\n");
+		update_e820();
+
+		return 1;
+	}
+
+	return 0;
+}
+
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 311a4734609..5c2e266f41d 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -610,1060 +610,7 @@ static struct sysdev_driver mtrr_sysdev_driver = {
 	.resume		= mtrr_restore,
 };
 
-/* should be related to MTRR_VAR_RANGES nums */
-#define RANGE_NUM 256
-
-struct res_range {
-	unsigned long start;
-	unsigned long end;
-};
-
-static int __init
-add_range(struct res_range *range, int nr_range, unsigned long start,
-			      unsigned long end)
-{
-	/* out of slots */
-	if (nr_range >= RANGE_NUM)
-		return nr_range;
-
-	range[nr_range].start = start;
-	range[nr_range].end = end;
-
-	nr_range++;
-
-	return nr_range;
-}
-
-static int __init
-add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
-			      unsigned long end)
-{
-	int i;
-
-	/* try to merge it with old one */
-	for (i = 0; i < nr_range; i++) {
-		unsigned long final_start, final_end;
-		unsigned long common_start, common_end;
-
-		if (!range[i].end)
-			continue;
-
-		common_start = max(range[i].start, start);
-		common_end = min(range[i].end, end);
-		if (common_start > common_end + 1)
-			continue;
-
-		final_start = min(range[i].start, start);
-		final_end = max(range[i].end, end);
-
-		range[i].start = final_start;
-		range[i].end =  final_end;
-		return nr_range;
-	}
-
-	/* need to add that */
-	return add_range(range, nr_range, start, end);
-}
-
-static void __init
-subtract_range(struct res_range *range, unsigned long start, unsigned long end)
-{
-	int i, j;
-
-	for (j = 0; j < RANGE_NUM; j++) {
-		if (!range[j].end)
-			continue;
-
-		if (start <= range[j].start && end >= range[j].end) {
-			range[j].start = 0;
-			range[j].end = 0;
-			continue;
-		}
-
-		if (start <= range[j].start && end < range[j].end &&
-		    range[j].start < end + 1) {
-			range[j].start = end + 1;
-			continue;
-		}
-
-
-		if (start > range[j].start && end >= range[j].end &&
-		    range[j].end > start - 1) {
-			range[j].end = start - 1;
-			continue;
-		}
-
-		if (start > range[j].start && end < range[j].end) {
-			/* find the new spare */
-			for (i = 0; i < RANGE_NUM; i++) {
-				if (range[i].end == 0)
-					break;
-			}
-			if (i < RANGE_NUM) {
-				range[i].end = range[j].end;
-				range[i].start = end + 1;
-			} else {
-				printk(KERN_ERR "run of slot in ranges\n");
-			}
-			range[j].end = start - 1;
-			continue;
-		}
-	}
-}
-
-static int __init cmp_range(const void *x1, const void *x2)
-{
-	const struct res_range *r1 = x1;
-	const struct res_range *r2 = x2;
-	long start1, start2;
-
-	start1 = r1->start;
-	start2 = r2->start;
-
-	return start1 - start2;
-}
-
-struct var_mtrr_range_state {
-	unsigned long base_pfn;
-	unsigned long size_pfn;
-	mtrr_type type;
-};
-
-static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
-static int __initdata debug_print;
-
-static int __init
-x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
-		       unsigned long extra_remove_base,
-		       unsigned long extra_remove_size)
-{
-	unsigned long i, base, size;
-	mtrr_type type;
-
-	for (i = 0; i < num_var_ranges; i++) {
-		type = range_state[i].type;
-		if (type != MTRR_TYPE_WRBACK)
-			continue;
-		base = range_state[i].base_pfn;
-		size = range_state[i].size_pfn;
-		nr_range = add_range_with_merge(range, nr_range, base,
-						base + size - 1);
-	}
-	if (debug_print) {
-		printk(KERN_DEBUG "After WB checking\n");
-		for (i = 0; i < nr_range; i++)
-			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
-				 range[i].start, range[i].end + 1);
-	}
-
-	/* take out UC ranges */
-	for (i = 0; i < num_var_ranges; i++) {
-		type = range_state[i].type;
-		if (type != MTRR_TYPE_UNCACHABLE &&
-		    type != MTRR_TYPE_WRPROT)
-			continue;
-		size = range_state[i].size_pfn;
-		if (!size)
-			continue;
-		base = range_state[i].base_pfn;
-		subtract_range(range, base, base + size - 1);
-	}
-	if (extra_remove_size)
-		subtract_range(range, extra_remove_base,
-				 extra_remove_base + extra_remove_size  - 1);
-
-	/* get new range num */
-	nr_range = 0;
-	for (i = 0; i < RANGE_NUM; i++) {
-		if (!range[i].end)
-			continue;
-		nr_range++;
-	}
-	if  (debug_print) {
-		printk(KERN_DEBUG "After UC checking\n");
-		for (i = 0; i < nr_range; i++)
-			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
-				 range[i].start, range[i].end + 1);
-	}
-
-	/* sort the ranges */
-	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
-	if  (debug_print) {
-		printk(KERN_DEBUG "After sorting\n");
-		for (i = 0; i < nr_range; i++)
-			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
-				 range[i].start, range[i].end + 1);
-	}
-
-	/* clear those is not used */
-	for (i = nr_range; i < RANGE_NUM; i++)
-		memset(&range[i], 0, sizeof(range[i]));
-
-	return nr_range;
-}
-
-static struct res_range __initdata range[RANGE_NUM];
-static int __initdata nr_range;
-
-#ifdef CONFIG_MTRR_SANITIZER
-
-static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
-{
-	unsigned long sum;
-	int i;
-
-	sum = 0;
-	for (i = 0; i < nr_range; i++)
-		sum += range[i].end + 1 - range[i].start;
-
-	return sum;
-}
-
-static int enable_mtrr_cleanup __initdata =
-	CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
-
-static int __init disable_mtrr_cleanup_setup(char *str)
-{
-	enable_mtrr_cleanup = 0;
-	return 0;
-}
-early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
-
-static int __init enable_mtrr_cleanup_setup(char *str)
-{
-	enable_mtrr_cleanup = 1;
-	return 0;
-}
-early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
-
-static int __init mtrr_cleanup_debug_setup(char *str)
-{
-	debug_print = 1;
-	return 0;
-}
-early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
-
-struct var_mtrr_state {
-	unsigned long	range_startk;
-	unsigned long	range_sizek;
-	unsigned long	chunk_sizek;
-	unsigned long	gran_sizek;
-	unsigned int	reg;
-};
-
-static void __init
-set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
-		unsigned char type, unsigned int address_bits)
-{
-	u32 base_lo, base_hi, mask_lo, mask_hi;
-	u64 base, mask;
-
-	if (!sizek) {
-		fill_mtrr_var_range(reg, 0, 0, 0, 0);
-		return;
-	}
-
-	mask = (1ULL << address_bits) - 1;
-	mask &= ~((((u64)sizek) << 10) - 1);
-
-	base  = ((u64)basek) << 10;
-
-	base |= type;
-	mask |= 0x800;
-
-	base_lo = base & ((1ULL<<32) - 1);
-	base_hi = base >> 32;
-
-	mask_lo = mask & ((1ULL<<32) - 1);
-	mask_hi = mask >> 32;
-
-	fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
-}
-
-static void __init
-save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
-		unsigned char type)
-{
-	range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
-	range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
-	range_state[reg].type = type;
-}
-
-static void __init
-set_var_mtrr_all(unsigned int address_bits)
-{
-	unsigned long basek, sizek;
-	unsigned char type;
-	unsigned int reg;
-
-	for (reg = 0; reg < num_var_ranges; reg++) {
-		basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
-		sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
-		type = range_state[reg].type;
-
-		set_var_mtrr(reg, basek, sizek, type, address_bits);
-	}
-}
-
-static unsigned long to_size_factor(unsigned long sizek, char *factorp)
-{
-	char factor;
-	unsigned long base = sizek;
-
-	if (base & ((1<<10) - 1)) {
-		/* not MB alignment */
-		factor = 'K';
-	} else if (base & ((1<<20) - 1)){
-		factor = 'M';
-		base >>= 10;
-	} else {
-		factor = 'G';
-		base >>= 20;
-	}
-
-	*factorp = factor;
-
-	return base;
-}
-
-static unsigned int __init
-range_to_mtrr(unsigned int reg, unsigned long range_startk,
-	      unsigned long range_sizek, unsigned char type)
-{
-	if (!range_sizek || (reg >= num_var_ranges))
-		return reg;
-
-	while (range_sizek) {
-		unsigned long max_align, align;
-		unsigned long sizek;
-
-		/* Compute the maximum size I can make a range */
-		if (range_startk)
-			max_align = ffs(range_startk) - 1;
-		else
-			max_align = 32;
-		align = fls(range_sizek) - 1;
-		if (align > max_align)
-			align = max_align;
-
-		sizek = 1 << align;
-		if (debug_print) {
-			char start_factor = 'K', size_factor = 'K';
-			unsigned long start_base, size_base;
-
-			start_base = to_size_factor(range_startk, &start_factor),
-			size_base = to_size_factor(sizek, &size_factor),
-
-			printk(KERN_DEBUG "Setting variable MTRR %d, "
-				"base: %ld%cB, range: %ld%cB, type %s\n",
-				reg, start_base, start_factor,
-				size_base, size_factor,
-				(type == MTRR_TYPE_UNCACHABLE)?"UC":
-				    ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
-				);
-		}
-		save_var_mtrr(reg++, range_startk, sizek, type);
-		range_startk += sizek;
-		range_sizek -= sizek;
-		if (reg >= num_var_ranges)
-			break;
-	}
-	return reg;
-}
-
-static unsigned __init
-range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
-			unsigned long sizek)
-{
-	unsigned long hole_basek, hole_sizek;
-	unsigned long second_basek, second_sizek;
-	unsigned long range0_basek, range0_sizek;
-	unsigned long range_basek, range_sizek;
-	unsigned long chunk_sizek;
-	unsigned long gran_sizek;
-
-	hole_basek = 0;
-	hole_sizek = 0;
-	second_basek = 0;
-	second_sizek = 0;
-	chunk_sizek = state->chunk_sizek;
-	gran_sizek = state->gran_sizek;
-
-	/* align with gran size, prevent small block used up MTRRs */
-	range_basek = ALIGN(state->range_startk, gran_sizek);
-	if ((range_basek > basek) && basek)
-		return second_sizek;
-	state->range_sizek -= (range_basek - state->range_startk);
-	range_sizek = ALIGN(state->range_sizek, gran_sizek);
-
-	while (range_sizek > state->range_sizek) {
-		range_sizek -= gran_sizek;
-		if (!range_sizek)
-			return 0;
-	}
-	state->range_sizek = range_sizek;
-
-	/* try to append some small hole */
-	range0_basek = state->range_startk;
-	range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
-
-	/* no increase */
-	if (range0_sizek == state->range_sizek) {
-		if (debug_print)
-			printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
-				range0_basek<<10,
-				(range0_basek + state->range_sizek)<<10);
-		state->reg = range_to_mtrr(state->reg, range0_basek,
-				state->range_sizek, MTRR_TYPE_WRBACK);
-		return 0;
-	}
-
-	/* only cut back, when it is not the last */
-	if (sizek) {
-		while (range0_basek + range0_sizek > (basek + sizek)) {
-			if (range0_sizek >= chunk_sizek)
-				range0_sizek -= chunk_sizek;
-			else
-				range0_sizek = 0;
-
-			if (!range0_sizek)
-				break;
-		}
-	}
-
-second_try:
-	range_basek = range0_basek + range0_sizek;
-
-	/* one hole in the middle */
-	if (range_basek > basek && range_basek <= (basek + sizek))
-		second_sizek = range_basek - basek;
-
-	if (range0_sizek > state->range_sizek) {
-
-		/* one hole in middle or at end */
-		hole_sizek = range0_sizek - state->range_sizek - second_sizek;
-
-		/* hole size should be less than half of range0 size */
-		if (hole_sizek >= (range0_sizek >> 1) &&
-		    range0_sizek >= chunk_sizek) {
-			range0_sizek -= chunk_sizek;
-			second_sizek = 0;
-			hole_sizek = 0;
-
-			goto second_try;
-		}
-	}
-
-	if (range0_sizek) {
-		if (debug_print)
-			printk(KERN_DEBUG "range0: %016lx - %016lx\n",
-				range0_basek<<10,
-				(range0_basek + range0_sizek)<<10);
-		state->reg = range_to_mtrr(state->reg, range0_basek,
-				range0_sizek, MTRR_TYPE_WRBACK);
-	}
-
-	if (range0_sizek < state->range_sizek) {
-		/* need to handle left over */
-		range_sizek = state->range_sizek - range0_sizek;
-
-		if (debug_print)
-			printk(KERN_DEBUG "range: %016lx - %016lx\n",
-				 range_basek<<10,
-				 (range_basek + range_sizek)<<10);
-		state->reg = range_to_mtrr(state->reg, range_basek,
-				 range_sizek, MTRR_TYPE_WRBACK);
-	}
-
-	if (hole_sizek) {
-		hole_basek = range_basek - hole_sizek - second_sizek;
-		if (debug_print)
-			printk(KERN_DEBUG "hole: %016lx - %016lx\n",
-				 hole_basek<<10,
-				 (hole_basek + hole_sizek)<<10);
-		state->reg = range_to_mtrr(state->reg, hole_basek,
-				 hole_sizek, MTRR_TYPE_UNCACHABLE);
-	}
-
-	return second_sizek;
-}
-
-static void __init
-set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
-		   unsigned long size_pfn)
-{
-	unsigned long basek, sizek;
-	unsigned long second_sizek = 0;
-
-	if (state->reg >= num_var_ranges)
-		return;
-
-	basek = base_pfn << (PAGE_SHIFT - 10);
-	sizek = size_pfn << (PAGE_SHIFT - 10);
-
-	/* See if I can merge with the last range */
-	if ((basek <= 1024) ||
-	    (state->range_startk + state->range_sizek == basek)) {
-		unsigned long endk = basek + sizek;
-		state->range_sizek = endk - state->range_startk;
-		return;
-	}
-	/* Write the range mtrrs */
-	if (state->range_sizek != 0)
-		second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
-
-	/* Allocate an msr */
-	state->range_startk = basek + second_sizek;
-	state->range_sizek  = sizek - second_sizek;
-}
-
-/* mininum size of mtrr block that can take hole */
-static u64 mtrr_chunk_size __initdata = (256ULL<<20);
-
-static int __init parse_mtrr_chunk_size_opt(char *p)
-{
-	if (!p)
-		return -EINVAL;
-	mtrr_chunk_size = memparse(p, &p);
-	return 0;
-}
-early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
-
-/* granity of mtrr of block */
-static u64 mtrr_gran_size __initdata;
-
-static int __init parse_mtrr_gran_size_opt(char *p)
-{
-	if (!p)
-		return -EINVAL;
-	mtrr_gran_size = memparse(p, &p);
-	return 0;
-}
-early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
-
-static int nr_mtrr_spare_reg __initdata =
-				 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
-
-static int __init parse_mtrr_spare_reg(char *arg)
-{
-	if (arg)
-		nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
-	return 0;
-}
-
-early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
-
-static int __init
-x86_setup_var_mtrrs(struct res_range *range, int nr_range,
-		    u64 chunk_size, u64 gran_size)
-{
-	struct var_mtrr_state var_state;
-	int i;
-	int num_reg;
-
-	var_state.range_startk	= 0;
-	var_state.range_sizek	= 0;
-	var_state.reg		= 0;
-	var_state.chunk_sizek	= chunk_size >> 10;
-	var_state.gran_sizek	= gran_size >> 10;
-
-	memset(range_state, 0, sizeof(range_state));
-
-	/* Write the range etc */
-	for (i = 0; i < nr_range; i++)
-		set_var_mtrr_range(&var_state, range[i].start,
-				   range[i].end - range[i].start + 1);
-
-	/* Write the last range */
-	if (var_state.range_sizek != 0)
-		range_to_mtrr_with_hole(&var_state, 0, 0);
-
-	num_reg = var_state.reg;
-	/* Clear out the extra MTRR's */
-	while (var_state.reg < num_var_ranges) {
-		save_var_mtrr(var_state.reg, 0, 0, 0);
-		var_state.reg++;
-	}
-
-	return num_reg;
-}
-
-struct mtrr_cleanup_result {
-	unsigned long gran_sizek;
-	unsigned long chunk_sizek;
-	unsigned long lose_cover_sizek;
-	unsigned int num_reg;
-	int bad;
-};
-
-/*
- * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
- * chunk size: gran_size, ..., 2G
- * so we need (1+16)*8
- */
-#define NUM_RESULT	136
-#define PSHIFT		(PAGE_SHIFT - 10)
-
-static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
-static unsigned long __initdata min_loss_pfn[RANGE_NUM];
-
-static void __init print_out_mtrr_range_state(void)
-{
-	int i;
-	char start_factor = 'K', size_factor = 'K';
-	unsigned long start_base, size_base;
-	mtrr_type type;
-
-	for (i = 0; i < num_var_ranges; i++) {
-
-		size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
-		if (!size_base)
-			continue;
-
-		size_base = to_size_factor(size_base, &size_factor),
-		start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
-		start_base = to_size_factor(start_base, &start_factor),
-		type = range_state[i].type;
-
-		printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
-			i, start_base, start_factor,
-			size_base, size_factor,
-			(type == MTRR_TYPE_UNCACHABLE) ? "UC" :
-			    ((type == MTRR_TYPE_WRPROT) ? "WP" :
-			     ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
-			);
-	}
-}
-
-static int __init mtrr_need_cleanup(void)
-{
-	int i;
-	mtrr_type type;
-	unsigned long size;
-	/* extra one for all 0 */
-	int num[MTRR_NUM_TYPES + 1];
-
-	/* check entries number */
-	memset(num, 0, sizeof(num));
-	for (i = 0; i < num_var_ranges; i++) {
-		type = range_state[i].type;
-		size = range_state[i].size_pfn;
-		if (type >= MTRR_NUM_TYPES)
-			continue;
-		if (!size)
-			type = MTRR_NUM_TYPES;
-		if (type == MTRR_TYPE_WRPROT)
-			type = MTRR_TYPE_UNCACHABLE;
-		num[type]++;
-	}
-
-	/* check if we got UC entries */
-	if (!num[MTRR_TYPE_UNCACHABLE])
-		return 0;
-
-	/* check if we only had WB and UC */
-	if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
-		num_var_ranges - num[MTRR_NUM_TYPES])
-		return 0;
-
-	return 1;
-}
-
-static unsigned long __initdata range_sums;
-static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
-					 unsigned long extra_remove_base,
-					 unsigned long extra_remove_size,
-					 int i)
-{
-	int num_reg;
-	static struct res_range range_new[RANGE_NUM];
-	static int nr_range_new;
-	unsigned long range_sums_new;
-
-	/* convert ranges to var ranges state */
-	num_reg = x86_setup_var_mtrrs(range, nr_range,
-						chunk_size, gran_size);
-
-	/* we got new setting in range_state, check it */
-	memset(range_new, 0, sizeof(range_new));
-	nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
-				extra_remove_base, extra_remove_size);
-	range_sums_new = sum_ranges(range_new, nr_range_new);
-
-	result[i].chunk_sizek = chunk_size >> 10;
-	result[i].gran_sizek = gran_size >> 10;
-	result[i].num_reg = num_reg;
-	if (range_sums < range_sums_new) {
-		result[i].lose_cover_sizek =
-			(range_sums_new - range_sums) << PSHIFT;
-		result[i].bad = 1;
-	} else
-		result[i].lose_cover_sizek =
-			(range_sums - range_sums_new) << PSHIFT;
-
-	/* double check it */
-	if (!result[i].bad && !result[i].lose_cover_sizek) {
-		if (nr_range_new != nr_range ||
-			memcmp(range, range_new, sizeof(range)))
-				result[i].bad = 1;
-	}
-
-	if (!result[i].bad && (range_sums - range_sums_new <
-				min_loss_pfn[num_reg])) {
-		min_loss_pfn[num_reg] =
-			range_sums - range_sums_new;
-	}
-}
-
-static void __init mtrr_print_out_one_result(int i)
-{
-	char gran_factor, chunk_factor, lose_factor;
-	unsigned long gran_base, chunk_base, lose_base;
-
-	gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
-	chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
-	lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
-	printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
-			result[i].bad ? "*BAD*" : " ",
-			gran_base, gran_factor, chunk_base, chunk_factor);
-	printk(KERN_CONT "num_reg: %d  \tlose cover RAM: %s%ld%c\n",
-			result[i].num_reg, result[i].bad ? "-" : "",
-			lose_base, lose_factor);
-}
-
-static int __init mtrr_search_optimal_index(void)
-{
-	int i;
-	int num_reg_good;
-	int index_good;
-
-	if (nr_mtrr_spare_reg >= num_var_ranges)
-		nr_mtrr_spare_reg = num_var_ranges - 1;
-	num_reg_good = -1;
-	for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
-		if (!min_loss_pfn[i])
-			num_reg_good = i;
-	}
-
-	index_good = -1;
-	if (num_reg_good != -1) {
-		for (i = 0; i < NUM_RESULT; i++) {
-			if (!result[i].bad &&
-			    result[i].num_reg == num_reg_good &&
-			    !result[i].lose_cover_sizek) {
-				index_good = i;
-				break;
-			}
-		}
-	}
-
-	return index_good;
-}
-
-
-static int __init mtrr_cleanup(unsigned address_bits)
-{
-	unsigned long extra_remove_base, extra_remove_size;
-	unsigned long base, size, def, dummy;
-	mtrr_type type;
-	u64 chunk_size, gran_size;
-	int index_good;
-	int i;
-
-	if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
-		return 0;
-	rdmsr(MTRRdefType_MSR, def, dummy);
-	def &= 0xff;
-	if (def != MTRR_TYPE_UNCACHABLE)
-		return 0;
-
-	/* get it and store it aside */
-	memset(range_state, 0, sizeof(range_state));
-	for (i = 0; i < num_var_ranges; i++) {
-		mtrr_if->get(i, &base, &size, &type);
-		range_state[i].base_pfn = base;
-		range_state[i].size_pfn = size;
-		range_state[i].type = type;
-	}
-
-	/* check if we need handle it and can handle it */
-	if (!mtrr_need_cleanup())
-		return 0;
-
-	/* print original var MTRRs at first, for debugging: */
-	printk(KERN_DEBUG "original variable MTRRs\n");
-	print_out_mtrr_range_state();
-
-	memset(range, 0, sizeof(range));
-	extra_remove_size = 0;
-	extra_remove_base = 1 << (32 - PAGE_SHIFT);
-	if (mtrr_tom2)
-		extra_remove_size =
-			(mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
-	nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
-					  extra_remove_size);
-	/*
-	 * [0, 1M) should always be coverred by var mtrr with WB
-	 * and fixed mtrrs should take effective before var mtrr for it
-	 */
-	nr_range = add_range_with_merge(range, nr_range, 0,
-					(1ULL<<(20 - PAGE_SHIFT)) - 1);
-	/* sort the ranges */
-	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
-
-	range_sums = sum_ranges(range, nr_range);
-	printk(KERN_INFO "total RAM coverred: %ldM\n",
-	       range_sums >> (20 - PAGE_SHIFT));
-
-	if (mtrr_chunk_size && mtrr_gran_size) {
-		i = 0;
-		mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
-				      extra_remove_base, extra_remove_size, i);
-
-		mtrr_print_out_one_result(i);
-
-		if (!result[i].bad) {
-			set_var_mtrr_all(address_bits);
-			printk(KERN_DEBUG "New variable MTRRs\n");
-			print_out_mtrr_range_state();
-			return 1;
-		}
-		printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
-		       "will find optimal one\n");
-	}
-
-	i = 0;
-	memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
-	memset(result, 0, sizeof(result));
-	for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
-
-		for (chunk_size = gran_size; chunk_size < (1ULL<<32);
-		     chunk_size <<= 1) {
-
-			if (i >= NUM_RESULT)
-				continue;
-
-			mtrr_calc_range_state(chunk_size, gran_size,
-				      extra_remove_base, extra_remove_size, i);
-			if (debug_print) {
-				mtrr_print_out_one_result(i);
-				printk(KERN_INFO "\n");
-			}
-
-			i++;
-		}
-	}
-
-	/* try to find the optimal index */
-	index_good = mtrr_search_optimal_index();
-
-	if (index_good != -1) {
-		printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
-		i = index_good;
-		mtrr_print_out_one_result(i);
-
-		/* convert ranges to var ranges state */
-		chunk_size = result[i].chunk_sizek;
-		chunk_size <<= 10;
-		gran_size = result[i].gran_sizek;
-		gran_size <<= 10;
-		x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
-		set_var_mtrr_all(address_bits);
-		printk(KERN_DEBUG "New variable MTRRs\n");
-		print_out_mtrr_range_state();
-		return 1;
-	} else {
-		/* print out all */
-		for (i = 0; i < NUM_RESULT; i++)
-			mtrr_print_out_one_result(i);
-	}
-
-	printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
-	printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
-
-	return 0;
-}
-#else
-static int __init mtrr_cleanup(unsigned address_bits)
-{
-	return 0;
-}
-#endif
-
-static int __initdata changed_by_mtrr_cleanup;
-
-static int disable_mtrr_trim;
-
-static int __init disable_mtrr_trim_setup(char *str)
-{
-	disable_mtrr_trim = 1;
-	return 0;
-}
-early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
-
-/*
- * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
- * for memory >4GB. Check for that here.
- * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
- * apply to are wrong, but so far we don't know of any such case in the wild.
- */
-#define Tom2Enabled (1U << 21)
-#define Tom2ForceMemTypeWB (1U << 22)
-
-int __init amd_special_default_mtrr(void)
-{
-	u32 l, h;
-
-	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
-		return 0;
-	if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
-		return 0;
-	/* In case some hypervisor doesn't pass SYSCFG through */
-	if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
-		return 0;
-	/*
-	 * Memory between 4GB and top of mem is forced WB by this magic bit.
-	 * Reserved before K8RevF, but should be zero there.
-	 */
-	if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
-		 (Tom2Enabled | Tom2ForceMemTypeWB))
-		return 1;
-	return 0;
-}
-
-static u64 __init real_trim_memory(unsigned long start_pfn,
-				   unsigned long limit_pfn)
-{
-	u64 trim_start, trim_size;
-	trim_start = start_pfn;
-	trim_start <<= PAGE_SHIFT;
-	trim_size = limit_pfn;
-	trim_size <<= PAGE_SHIFT;
-	trim_size -= trim_start;
-
-	return e820_update_range(trim_start, trim_size, E820_RAM,
-				E820_RESERVED);
-}
-/**
- * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
- * @end_pfn: ending page frame number
- *
- * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
- * memory configurations.  This routine checks that the highest MTRR matches
- * the end of memory, to make sure the MTRRs having a write back type cover
- * all of the memory the kernel is intending to use. If not, it'll trim any
- * memory off the end by adjusting end_pfn, removing it from the kernel's
- * allocation pools, warning the user with an obnoxious message.
- */
-int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
-{
-	unsigned long i, base, size, highest_pfn = 0, def, dummy;
-	mtrr_type type;
-	u64 total_trim_size;
-
-	/* extra one for all 0 */
-	int num[MTRR_NUM_TYPES + 1];
-	/*
-	 * Make sure we only trim uncachable memory on machines that
-	 * support the Intel MTRR architecture:
-	 */
-	if (!is_cpu(INTEL) || disable_mtrr_trim)
-		return 0;
-	rdmsr(MTRRdefType_MSR, def, dummy);
-	def &= 0xff;
-	if (def != MTRR_TYPE_UNCACHABLE)
-		return 0;
-
-	/* get it and store it aside */
-	memset(range_state, 0, sizeof(range_state));
-	for (i = 0; i < num_var_ranges; i++) {
-		mtrr_if->get(i, &base, &size, &type);
-		range_state[i].base_pfn = base;
-		range_state[i].size_pfn = size;
-		range_state[i].type = type;
-	}
-
-	/* Find highest cached pfn */
-	for (i = 0; i < num_var_ranges; i++) {
-		type = range_state[i].type;
-		if (type != MTRR_TYPE_WRBACK)
-			continue;
-		base = range_state[i].base_pfn;
-		size = range_state[i].size_pfn;
-		if (highest_pfn < base + size)
-			highest_pfn = base + size;
-	}
-
-	/* kvm/qemu doesn't have mtrr set right, don't trim them all */
-	if (!highest_pfn) {
-		printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
-		return 0;
-	}
-
-	/* check entries number */
-	memset(num, 0, sizeof(num));
-	for (i = 0; i < num_var_ranges; i++) {
-		type = range_state[i].type;
-		if (type >= MTRR_NUM_TYPES)
-			continue;
-		size = range_state[i].size_pfn;
-		if (!size)
-			type = MTRR_NUM_TYPES;
-		num[type]++;
-	}
-
-	/* no entry for WB? */
-	if (!num[MTRR_TYPE_WRBACK])
-		return 0;
-
-	/* check if we only had WB and UC */
-	if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
-		num_var_ranges - num[MTRR_NUM_TYPES])
-		return 0;
-
-	memset(range, 0, sizeof(range));
-	nr_range = 0;
-	if (mtrr_tom2) {
-		range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
-		range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
-		if (highest_pfn < range[nr_range].end + 1)
-			highest_pfn = range[nr_range].end + 1;
-		nr_range++;
-	}
-	nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
-
-	total_trim_size = 0;
-	/* check the head */
-	if (range[0].start)
-		total_trim_size += real_trim_memory(0, range[0].start);
-	/* check the holes */
-	for (i = 0; i < nr_range - 1; i++) {
-		if (range[i].end + 1 < range[i+1].start)
-			total_trim_size += real_trim_memory(range[i].end + 1,
-							    range[i+1].start);
-	}
-	/* check the top */
-	i = nr_range - 1;
-	if (range[i].end + 1 < end_pfn)
-		total_trim_size += real_trim_memory(range[i].end + 1,
-							 end_pfn);
-
-	if (total_trim_size) {
-		printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
-			" all of memory, losing %lluMB of RAM.\n",
-			total_trim_size >> 20);
-
-		if (!changed_by_mtrr_cleanup)
-			WARN_ON(1);
-
-		printk(KERN_INFO "update e820 for mtrr\n");
-		update_e820();
-
-		return 1;
-	}
-
-	return 0;
-}
+int __initdata changed_by_mtrr_cleanup;
 
 /**
  * mtrr_bp_init - initialize mtrrs on the boot CPU
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index ffd60409cc6..6710e93021a 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -88,3 +88,6 @@ void mtrr_wrmsr(unsigned, unsigned, unsigned);
 int amd_init_mtrr(void);
 int cyrix_init_mtrr(void);
 int centaur_init_mtrr(void);
+
+extern int changed_by_mtrr_cleanup;
+extern int mtrr_cleanup(unsigned address_bits);
-- 
cgit v1.2.3


From 91219bcbdcccc1686b0ecce09e28825c93619c07 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Thu, 12 Mar 2009 02:37:00 +0530
Subject: x86: cpu_debug add write support for MSRs

Supported write flag for registers.
currently write is enabled only for PMC MSR.

[root@ht]# cat /sys/kernel/debug/x86/cpu/cpu1/pmc/0x300/value
0x0

[root@ht]# echo 1234 > /sys/kernel/debug/x86/cpu/cpu1/pmc/0x300/value
[root@ht]# cat /sys/kernel/debug/x86/cpu/cpu1/pmc/0x300/value
0x4d2

[root@ht]# echo 0x1234 > /sys/kernel/debug/x86/cpu/cpu1/pmc/0x300/value
[root@ht]# cat /sys/kernel/debug/x86/cpu/cpu1/pmc/0x300/value
0x1234

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/cpu_debug.c | 118 +++++++++++++++++++++++++++++-----------
 1 file changed, 86 insertions(+), 32 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 9abbcbd933c..21c0cf8ced1 100755
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -11,6 +11,7 @@
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
 #include <linux/kprobes.h>
+#include <linux/uaccess.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
@@ -40,41 +41,41 @@ static DEFINE_MUTEX(cpu_debug_lock);
 static struct dentry *cpu_debugfs_dir;
 
 static struct cpu_debug_base cpu_base[] = {
-	{ "mc",		CPU_MC		},	/* Machine Check	*/
-	{ "monitor",	CPU_MONITOR	},	/* Monitor		*/
-	{ "time",	CPU_TIME	},	/* Time			*/
-	{ "pmc",	CPU_PMC		},	/* Performance Monitor	*/
-	{ "platform",	CPU_PLATFORM	},	/* Platform		*/
-	{ "apic",	CPU_APIC	},	/* APIC			*/
-	{ "poweron",	CPU_POWERON	},	/* Power-on		*/
-	{ "control",	CPU_CONTROL	},	/* Control		*/
-	{ "features",	CPU_FEATURES	},	/* Features control	*/
-	{ "lastbranch",	CPU_LBRANCH	},	/* Last Branch		*/
-	{ "bios",	CPU_BIOS	},	/* BIOS			*/
-	{ "freq",	CPU_FREQ	},	/* Frequency		*/
-	{ "mtrr",	CPU_MTRR	},	/* MTRR			*/
-	{ "perf",	CPU_PERF	},	/* Performance		*/
-	{ "cache",	CPU_CACHE	},	/* Cache		*/
-	{ "sysenter",	CPU_SYSENTER	},	/* Sysenter		*/
-	{ "therm",	CPU_THERM	},	/* Thermal		*/
-	{ "misc",	CPU_MISC	},	/* Miscellaneous	*/
-	{ "debug",	CPU_DEBUG	},	/* Debug		*/
-	{ "pat",	CPU_PAT		},	/* PAT			*/
-	{ "vmx",	CPU_VMX		},	/* VMX			*/
-	{ "call",	CPU_CALL	},	/* System Call		*/
-	{ "base",	CPU_BASE	},	/* BASE Address		*/
-	{ "smm",	CPU_SMM		},	/* System mgmt mode	*/
-	{ "svm",	CPU_SVM		},	/*Secure Virtial Machine*/
-	{ "osvm",	CPU_OSVM	},	/* OS-Visible Workaround*/
-	{ "tss",	CPU_TSS		},	/* Task Stack Segment	*/
-	{ "cr",		CPU_CR		},	/* Control Registers	*/
-	{ "dt",		CPU_DT		},	/* Descriptor Table	*/
-	{ "registers",	CPU_REG_ALL	},	/* Select all Registers	*/
+	{ "mc",		CPU_MC,		0	},
+	{ "monitor",	CPU_MONITOR,	0	},
+	{ "time",	CPU_TIME,	0	},
+	{ "pmc",	CPU_PMC,	1	},
+	{ "platform",	CPU_PLATFORM,	0	},
+	{ "apic",	CPU_APIC,	0	},
+	{ "poweron",	CPU_POWERON,	0	},
+	{ "control",	CPU_CONTROL,	0	},
+	{ "features",	CPU_FEATURES,	0	},
+	{ "lastbranch",	CPU_LBRANCH,	0	},
+	{ "bios",	CPU_BIOS,	0	},
+	{ "freq",	CPU_FREQ,	0	},
+	{ "mtrr",	CPU_MTRR,	0	},
+	{ "perf",	CPU_PERF,	0	},
+	{ "cache",	CPU_CACHE,	0	},
+	{ "sysenter",	CPU_SYSENTER,	0	},
+	{ "therm",	CPU_THERM,	0	},
+	{ "misc",	CPU_MISC,	0	},
+	{ "debug",	CPU_DEBUG,	0	},
+	{ "pat",	CPU_PAT,	0	},
+	{ "vmx",	CPU_VMX,	0	},
+	{ "call",	CPU_CALL,	0	},
+	{ "base",	CPU_BASE,	0	},
+	{ "smm",	CPU_SMM,	0	},
+	{ "svm",	CPU_SVM,	0	},
+	{ "osvm",	CPU_OSVM,	0	},
+	{ "tss",	CPU_TSS,	0	},
+	{ "cr",		CPU_CR,		0	},
+	{ "dt",		CPU_DT,		0	},
+	{ "registers",	CPU_REG_ALL,	0	},
 };
 
 static struct cpu_file_base cpu_file[] = {
-	{ "index",	CPU_REG_ALL	},	/* index		*/
-	{ "value",	CPU_REG_ALL	},	/* value		*/
+	{ "index",	CPU_REG_ALL,	0	},
+	{ "value",	CPU_REG_ALL,	1	},
 };
 
 /* Intel Registers Range */
@@ -608,9 +609,62 @@ static int cpu_seq_open(struct inode *inode, struct file *file)
 	return err;
 }
 
+static int write_msr(struct cpu_private *priv, u64 val)
+{
+	u32 low, high;
+
+	high = (val >> 32) & 0xffffffff;
+	low = val & 0xffffffff;
+
+	if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high))
+		return 0;
+
+	return -EPERM;
+}
+
+static int write_cpu_register(struct cpu_private *priv, const char *buf)
+{
+	int ret = -EPERM;
+	u64 val;
+
+	ret = strict_strtoull(buf, 0, &val);
+	if (ret < 0)
+		return ret;
+
+	/* Supporting only MSRs */
+	if (priv->type < CPU_TSS_BIT)
+		return write_msr(priv, val);
+
+	return ret;
+}
+
+static ssize_t cpu_write(struct file *file, const char __user *ubuf,
+			     size_t count, loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct cpu_private *priv = seq->private;
+	char buf[19];
+
+	if ((priv == NULL) || (count >= sizeof(buf)))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, count))
+		return -EFAULT;
+
+	buf[count] = 0;
+
+	if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write))
+		if (!write_cpu_register(priv, buf))
+			return count;
+
+	return -EACCES;
+}
+
 static const struct file_operations cpu_fops = {
+	.owner		= THIS_MODULE,
 	.open		= cpu_seq_open,
 	.read		= seq_read,
+	.write		= cpu_write,
 	.llseek		= seq_lseek,
 	.release	= seq_release,
 };
-- 
cgit v1.2.3


From 773e673de27297d07d852e7e9bfd1a695cae1da2 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 12 Mar 2009 21:35:18 -0700
Subject: x86: fix e820_update_range()

Impact: fix left range size on head

| commit 5c0e6f035df983210e4d22213aed624ced502d3d
|    x86: fix code paths used by update_mptable
|    Impact: fix crashes under Xen due to unrobust e820 code

fixes one e820 bug, but introduces another bug.

Need to update size for left range at first in case it is header.

also add __e820_add_region take more parameter.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: jbeulich@novell.com
LKML-Reference: <49B9E286.502@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 3cf6681ac80..95b81c18b6b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -110,19 +110,25 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
 /*
  * Add a memory region to the kernel e820 map.
  */
-void __init e820_add_region(u64 start, u64 size, int type)
+static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
+					 int type)
 {
-	int x = e820.nr_map;
+	int x = e820x->nr_map;
 
-	if (x == ARRAY_SIZE(e820.map)) {
+	if (x == ARRAY_SIZE(e820x->map)) {
 		printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
 		return;
 	}
 
-	e820.map[x].addr = start;
-	e820.map[x].size = size;
-	e820.map[x].type = type;
-	e820.nr_map++;
+	e820x->map[x].addr = start;
+	e820x->map[x].size = size;
+	e820x->map[x].type = type;
+	e820x->nr_map++;
+}
+
+void __init e820_add_region(u64 start, u64 size, int type)
+{
+	__e820_add_region(&e820, start, size, type);
 }
 
 void __init e820_print_map(char *who)
@@ -417,11 +423,11 @@ static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
 	return __append_e820_map(biosmap, nr_map);
 }
 
-static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
+static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
 					u64 size, unsigned old_type,
 					unsigned new_type)
 {
-	unsigned int i, x;
+	unsigned int i;
 	u64 real_updated_size = 0;
 
 	BUG_ON(old_type == new_type);
@@ -447,22 +453,19 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
 		if (final_start >= final_end)
 			continue;
 
-		x = e820x->nr_map;
-		if (x == ARRAY_SIZE(e820x->map)) {
-			printk(KERN_ERR "Too many memory map entries!\n");
-			break;
-		}
-		e820x->map[x].addr = final_start;
-		e820x->map[x].size = final_end - final_start;
-		e820x->map[x].type = new_type;
-		e820x->nr_map++;
+		__e820_add_region(e820x, final_start, final_end - final_start,
+				  new_type);
 
 		real_updated_size += final_end - final_start;
 
+		/*
+		 * left range could be head or tail, so need to update
+		 * size at first.
+		 */
+		ei->size -= final_end - final_start;
 		if (ei->addr < final_start)
 			continue;
 		ei->addr = final_end;
-		ei->size -= final_end - final_start;
 	}
 	return real_updated_size;
 }
@@ -470,13 +473,13 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
 u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
 			     unsigned new_type)
 {
-	return e820_update_range_map(&e820, start, size, old_type, new_type);
+	return __e820_update_range(&e820, start, size, old_type, new_type);
 }
 
 static u64 __init e820_update_range_saved(u64 start, u64 size,
 					  unsigned old_type, unsigned new_type)
 {
-	return e820_update_range_map(&e820_saved, start, size, old_type,
+	return __e820_update_range(&e820_saved, start, size, old_type,
 				     new_type);
 }
 
-- 
cgit v1.2.3


From 3ff42da5048649503e343a32be37b14a6a4e8aaf Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Thu, 12 Mar 2009 17:39:37 +0100
Subject: x86: mtrr: don't modify RdDram/WrDram bits of fixed MTRRs

Impact: bug fix + BIOS workaround

BIOS is expected to clear the SYSCFG[MtrrFixDramModEn] on AMD CPUs
after fixed MTRRs are configured.

Some BIOSes do not clear SYSCFG[MtrrFixDramModEn] on BP (and on APs).

This can lead to obfuscation in Linux when this bit is not cleared on
BP but cleared on APs. A consequence of this is that the saved
fixed-MTRR state (from BP) differs from the fixed-MTRRs of APs --
because RdDram/WrDram bits are read as zero when
SYSCFG[MtrrFixDramModEn] is cleared -- and Linux tries to sync
fixed-MTRR state from BP to AP. This implies that Linux sets
SYSCFG[MtrrFixDramEn] and activates those bits.

More important is that (some) systems change these bits in SMM when
ACPI is enabled. Hence it is racy if Linux modifies RdMem/WrMem bits,
too.

(1) The patch modifies an old fix from Bernhard Kaindl to get
    suspend/resume working on some Acer Laptops. Bernhard's patch
    tried to sync RdMem/WrMem bits of fixed MTRR registers and that
    helped on those old Laptops. (Don't ask me why -- can't test it
    myself). But this old problem was not the motivation for the
    patch. (See http://lkml.org/lkml/2007/4/3/110)

(2) The more important effect is to fix issues on some more current systems.

    On those systems Linux panics or just freezes, see

    http://bugzilla.kernel.org/show_bug.cgi?id=11541
    (and also duplicates of this bug:
    http://bugzilla.kernel.org/show_bug.cgi?id=11737
    http://bugzilla.kernel.org/show_bug.cgi?id=11714)

    The affected systems boot only using acpi=ht, acpi=off or
    when the kernel is built with CONFIG_MTRR=n.

    The acpi options prevent full enablement of ACPI.  Obviously when
    ACPI is enabled the BIOS/SMM modfies RdMem/WrMem bits.  When
    CONFIG_MTRR=y Linux also accesses and modifies those bits when it
    needs to sync fixed-MTRRs across cores (Bernhard's fix, see (1)).
    How do you synchronize that? You can't. As a consequence Linux
    shouldn't touch those bits at all (Rationale are AMD's BKDGs which
    recommend to clear the bit that makes RdMem/WrMem accessible).
    This is the purpose of this patch. And (so far) this suffices to
    fix (1) and (2).

I suggest not to touch RdDram/WrDram bits of fixed-MTRRs and
SYSCFG[MtrrFixDramEn] and to clear SYSCFG[MtrrFixDramModEn] as
suggested by AMD K8, and AMD family 10h/11h BKDGs.
BIOS is expected to do this anyway. This should avoid that
Linux and SMM tread on each other's toes ...

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: trenn@suse.de
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <20090312163937.GH20716@alberich.amd.com>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mtrr/generic.c | 51 ++++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 21 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 96440352010..950c434f793 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -33,6 +33,32 @@ u64 mtrr_tom2;
 struct mtrr_state_type mtrr_state = {};
 EXPORT_SYMBOL_GPL(mtrr_state);
 
+/**
+ * BIOS is expected to clear MtrrFixDramModEn bit, see for example
+ * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
+ * Opteron Processors" (26094 Rev. 3.30 February 2006), section
+ * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set
+ * to 1 during BIOS initalization of the fixed MTRRs, then cleared to
+ * 0 for operation."
+ */
+static inline void k8_check_syscfg_dram_mod_en(void)
+{
+	u32 lo, hi;
+
+	if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
+	      (boot_cpu_data.x86 >= 0x0f)))
+		return;
+
+	rdmsr(MSR_K8_SYSCFG, lo, hi);
+	if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
+		printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
+		       " not cleared by BIOS, clearing this bit\n",
+		       smp_processor_id());
+		lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
+		mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi);
+	}
+}
+
 /*
  * Returns the effective MTRR type for the region
  * Error returns:
@@ -166,6 +192,8 @@ get_fixed_ranges(mtrr_type * frs)
 	unsigned int *p = (unsigned int *) frs;
 	int i;
 
+	k8_check_syscfg_dram_mod_en();
+
 	rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
 
 	for (i = 0; i < 2; i++)
@@ -305,28 +333,11 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
 			smp_processor_id(), msr, a, b);
 }
 
-/**
- * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs
- * see AMD publication no. 24593, chapter 3.2.1 for more information
- */
-static inline void k8_enable_fixed_iorrs(void)
-{
-	unsigned lo, hi;
-
-	rdmsr(MSR_K8_SYSCFG, lo, hi);
-	mtrr_wrmsr(MSR_K8_SYSCFG, lo
-				| K8_MTRRFIXRANGE_DRAM_ENABLE
-				| K8_MTRRFIXRANGE_DRAM_MODIFY, hi);
-}
-
 /**
  * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have
  * @msr: MSR address of the MTTR which should be checked and updated
  * @changed: pointer which indicates whether the MTRR needed to be changed
  * @msrwords: pointer to the MSR values which the MSR should have
- *
- * If K8 extentions are wanted, update the K8 SYSCFG MSR also.
- * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information.
  */
 static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
 {
@@ -335,10 +346,6 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
 	rdmsr(msr, lo, hi);
 
 	if (lo != msrwords[0] || hi != msrwords[1]) {
-		if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-		    (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) &&
-		    ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
-			k8_enable_fixed_iorrs();
 		mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
 		*changed = true;
 	}
@@ -426,6 +433,8 @@ static int set_fixed_ranges(mtrr_type * frs)
 	bool changed = false;
 	int block=-1, range;
 
+	k8_check_syscfg_dram_mod_en();
+
 	while (fixed_range_blocks[++block].ranges)
 	    for (range=0; range < fixed_range_blocks[block].ranges; range++)
 		set_fixed_range(fixed_range_blocks[block].base_msr + range,
-- 
cgit v1.2.3


From 5a8ac9d28dae5330c70562c7d7785f5104059c17 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Am=C3=A9rico=20Wang?= <xiyou.wangcong@gmail.com>
Date: Fri, 13 Mar 2009 15:56:58 +0800
Subject: x86: ptrace, bts: fix an unreachable statement

Commit c2724775ce57c98b8af9694857b941dc61056516 put a statement
after return, which makes that statement unreachable.

Move that statement before return.

Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090313075622.GB8933@hack>
Cc: <stable@kernel.org> # .29 only
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ptrace.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 3d9672e59c1..19378715f41 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -685,9 +685,8 @@ static int ptrace_bts_config(struct task_struct *child,
 		if (!cfg.signal)
 			return -EINVAL;
 
-		return -EOPNOTSUPP;
-
 		child->thread.bts_ovfl_signal = cfg.signal;
+		return -EOPNOTSUPP;
 	}
 
 	if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
-- 
cgit v1.2.3


From 9766cdbcb260389669e9679b2aa87c11832f479f Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sat, 14 Mar 2009 11:19:49 +0530
Subject: x86: cpu/common.c cleanups

- fix various style problems
 - declare varibles before they get used
 - introduced clear_all_debug_regs
 - fix header files issues

LKML-Reference: <1237009789.4387.2.camel@localhost.localdomain>
Signed-off-by: Jaswinder Singh Rajput <jaswinder@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/common.c | 129 ++++++++++++++++++++++---------------------
 1 file changed, 65 insertions(+), 64 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 826d5c87627..cad6878c88d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,52 +1,52 @@
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
+#include <linux/topology.h>
 #include <linux/bootmem.h>
+#include <linux/linkage.h>
 #include <linux/bitops.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/kgdb.h>
-#include <linux/topology.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
 #include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/kgdb.h>
 #include <linux/smp.h>
-#include <linux/percpu.h>
-#include <asm/i387.h>
-#include <asm/msr.h>
-#include <asm/io.h>
-#include <asm/linkage.h>
+#include <linux/io.h>
+
+#include <asm/stackprotector.h>
 #include <asm/mmu_context.h>
+#include <asm/hypervisor.h>
+#include <asm/processor.h>
+#include <asm/sections.h>
+#include <asm/cpumask.h>
+#include <asm/pgtable.h>
+#include <asm/atomic.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include <asm/i387.h>
 #include <asm/mtrr.h>
+#include <asm/numa.h>
+#include <asm/asm.h>
+#include <asm/cpu.h>
 #include <asm/mce.h>
+#include <asm/msr.h>
 #include <asm/pat.h>
-#include <asm/asm.h>
-#include <asm/numa.h>
 #include <asm/smp.h>
-#include <asm/cpu.h>
-#include <asm/cpumask.h>
-#include <asm/apic.h>
 
 #ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/uv/uv.h>
 #endif
 
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/proto.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <asm/hypervisor.h>
-#include <asm/stackprotector.h>
-
 #include "cpu.h"
 
 #ifdef CONFIG_X86_64
 
 /* all of these masks are initialized in setup_cpu_local_masks() */
-cpumask_var_t cpu_callin_mask;
-cpumask_var_t cpu_callout_mask;
 cpumask_var_t cpu_initialized_mask;
+cpumask_var_t cpu_callout_mask;
+cpumask_var_t cpu_callin_mask;
 
 /* representing cpus for which sibling maps can be computed */
 cpumask_var_t cpu_sibling_setup_mask;
@@ -62,10 +62,10 @@ void __init setup_cpu_local_masks(void)
 
 #else /* CONFIG_X86_32 */
 
-cpumask_t cpu_callin_map;
+cpumask_t cpu_sibling_setup_map;
 cpumask_t cpu_callout_map;
 cpumask_t cpu_initialized;
-cpumask_t cpu_sibling_setup_map;
+cpumask_t cpu_callin_map;
 
 #endif /* CONFIG_X86_32 */
 
@@ -79,7 +79,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 	 * IRET will check the segment types  kkeil 2000/10/28
 	 * Also sysret mandates a special GDT layout
 	 *
-	 * The TLS descriptors are currently at a different place compared to i386.
+	 * TLS descriptors are currently at a different place compared to i386.
 	 * Hopefully nobody expects them at a fixed place (Wine?)
 	 */
 	[GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
@@ -243,6 +243,7 @@ cpuid_dependent_features[] = {
 static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
 {
 	const struct cpuid_dependent_feature *df;
+
 	for (df = cpuid_dependent_features; df->feature; df++) {
 		/*
 		 * Note: cpuid_level is set to -1 if unavailable, but
@@ -364,12 +365,12 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
 	   undo that brain damage */
 	p = q = &c->x86_model_id[0];
 	while (*p == ' ')
-	     p++;
+		p++;
 	if (p != q) {
-	     while (*p)
-		  *q++ = *p++;
-	     while (q <= &c->x86_model_id[48])
-		  *q++ = '\0';	/* Zero-pad the rest */
+		while (*p)
+			*q++ = *p++;
+		while (q <= &c->x86_model_id[48])
+			*q++ = '\0';	/* Zero-pad the rest */
 	}
 }
 
@@ -441,14 +442,15 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 	} else if (smp_num_siblings > 1) {
 
 		if (smp_num_siblings > nr_cpu_ids) {
-			printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
-					smp_num_siblings);
+			pr_warning("CPU: Unsupported number of siblings %d",
+				   smp_num_siblings);
 			smp_num_siblings = 1;
 			return;
 		}
 
 		index_msb = get_count_order(smp_num_siblings);
-		c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
+		c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid,
+						    index_msb);
 
 		smp_num_siblings = smp_num_siblings / c->x86_max_cores;
 
@@ -491,7 +493,8 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 
 	if (!printed) {
 		printed++;
-		printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
+		printk(KERN_ERR "CPU: vendor_id '%s'"
+			"unknown, using generic init.\n", v);
 		printk(KERN_ERR "CPU: Your system may be unstable.\n");
 	}
 
@@ -637,7 +640,7 @@ void __init early_cpu_init(void)
 	struct cpu_dev **cdev;
 	int count = 0;
 
-	printk("KERNEL supported cpus:\n");
+	printk(KERN_INFO "KERNEL supported cpus:\n");
 	for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
 		struct cpu_dev *cpudev = *cdev;
 		unsigned int j;
@@ -650,7 +653,7 @@ void __init early_cpu_init(void)
 		for (j = 0; j < 2; j++) {
 			if (!cpudev->c_ident[j])
 				continue;
-			printk("  %s %s\n", cpudev->c_vendor,
+			printk(KERN_INFO "  %s %s\n", cpudev->c_vendor,
 				cpudev->c_ident[j]);
 		}
 	}
@@ -952,8 +955,6 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
 	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
 	__aligned(PAGE_SIZE);
 
-extern asmlinkage void ignore_sysret(void);
-
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
@@ -999,6 +1000,22 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
 }
 #endif	/* x86_64 */
 
+/*
+ * Clear all 6 debug registers:
+ */
+static void clear_all_debug_regs(void)
+{
+	int i;
+
+	for (i = 0; i < 8; i++) {
+		/* Ignore db4, db5 */
+		if ((i == 4) || (i == 5))
+			continue;
+
+		set_debugreg(0, i);
+	}
+}
+
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
  * initialized (naturally) in the bootstrap process, such as the GDT
@@ -1098,17 +1115,7 @@ void __cpuinit cpu_init(void)
 		arch_kgdb_ops.correct_hw_break();
 	else
 #endif
-	{
-		/*
-		 * Clear all 6 debug registers:
-		 */
-		set_debugreg(0UL, 0);
-		set_debugreg(0UL, 1);
-		set_debugreg(0UL, 2);
-		set_debugreg(0UL, 3);
-		set_debugreg(0UL, 6);
-		set_debugreg(0UL, 7);
-	}
+		clear_all_debug_regs();
 
 	fpu_init();
 
@@ -1129,7 +1136,8 @@ void __cpuinit cpu_init(void)
 
 	if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
 		printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
-		for (;;) local_irq_enable();
+		for (;;)
+			local_irq_enable();
 	}
 
 	printk(KERN_INFO "Initializing CPU#%d\n", cpu);
@@ -1159,13 +1167,7 @@ void __cpuinit cpu_init(void)
 	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
 #endif
 
-	/* Clear all 6 debug registers: */
-	set_debugreg(0, 0);
-	set_debugreg(0, 1);
-	set_debugreg(0, 2);
-	set_debugreg(0, 3);
-	set_debugreg(0, 6);
-	set_debugreg(0, 7);
+	clear_all_debug_regs();
 
 	/*
 	 * Force FPU initialization:
@@ -1186,5 +1188,4 @@ void __cpuinit cpu_init(void)
 	xsave_init();
 }
 
-
 #endif
-- 
cgit v1.2.3


From 88200bc28da38bcda1cb1bd218216e83b426d8a8 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sat, 14 Mar 2009 12:08:13 +0530
Subject: x86: entry_32.S fix compile warnings - fix work mask bit width

Fix:

 arch/x86/kernel/entry_32.S:446: Warning: 00000000080001d1 shortened to 00000000000001d1
 arch/x86/kernel/entry_32.S:457: Warning: 000000000800feff shortened to 000000000000feff
 arch/x86/kernel/entry_32.S:527: Warning: 00000000080001d1 shortened to 00000000000001d1
 arch/x86/kernel/entry_32.S:541: Warning: 000000000800feff shortened to 000000000000feff
 arch/x86/kernel/entry_32.S:676: Warning: 0000000008000091 shortened to 0000000000000091

TIF_SYSCALL_FTRACE is 0x08000000 and until now we checked the
first 16 bits of the work mask - bit 27 falls outside of that.

Update the entry_32.S code to check the full 32-bit mask.

[ %cx => %ecx fix from Cyrill Gorcunov <gorcunov@gmail.com> ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: "H. Peter Anvin" <hpa@kernel.org>
LKML-Reference: <1237012693.18733.3.camel@ht.satnam>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_32.S | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 899e8938e79..c929add475c 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -442,8 +442,7 @@ sysenter_past_esp:
 
 	GET_THREAD_INFO(%ebp)
 
-	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-	testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
 	jnz sysenter_audit
 sysenter_do_call:
 	cmpl $(nr_syscalls), %eax
@@ -454,7 +453,7 @@ sysenter_do_call:
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
-	testw $_TIF_ALLWORK_MASK, %cx
+	testl $_TIF_ALLWORK_MASK, %ecx
 	jne sysexit_audit
 sysenter_exit:
 /* if something modifies registers it must also disable sysexit */
@@ -468,7 +467,7 @@ sysenter_exit:
 
 #ifdef CONFIG_AUDITSYSCALL
 sysenter_audit:
-	testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
 	jnz syscall_trace_entry
 	addl $4,%esp
 	CFI_ADJUST_CFA_OFFSET -4
@@ -485,7 +484,7 @@ sysenter_audit:
 	jmp sysenter_do_call
 
 sysexit_audit:
-	testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
 	jne syscall_exit_work
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_ANY)
@@ -498,7 +497,7 @@ sysexit_audit:
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
-	testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
 	jne syscall_exit_work
 	movl PT_EAX(%esp),%eax	/* reload syscall return value */
 	jmp sysenter_exit
@@ -523,8 +522,7 @@ ENTRY(system_call)
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
 					# system call tracing in operation / emulation
-	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-	testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
 	jnz syscall_trace_entry
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
@@ -538,7 +536,7 @@ syscall_exit:
 					# between sampling and the iret
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
-	testw $_TIF_ALLWORK_MASK, %cx	# current->work
+	testl $_TIF_ALLWORK_MASK, %ecx	# current->work
 	jne syscall_exit_work
 
 restore_all:
@@ -673,7 +671,7 @@ END(syscall_trace_entry)
 	# perform syscall exit tracing
 	ALIGN
 syscall_exit_work:
-	testb $_TIF_WORK_SYSCALL_EXIT, %cl
+	testl $_TIF_WORK_SYSCALL_EXIT, %ecx
 	jz work_pending
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_ANY)	# could let syscall_trace_leave() call
-- 
cgit v1.2.3


From 0f3fa48a7eaf5d1118cfda1650e8c759b2a116e4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 14 Mar 2009 08:46:17 +0100
Subject: x86: cpu/common.c more cleanups

Complete/fix the cleanups of cpu/common.c:

 - fix ugly warning due to asm/topology.h -> linux/topology.h change
 - standardize the style across the file
 - simplify/refactor the code flow where possible

Cc: Jaswinder Singh Rajput <jaswinder@kernel.org>
LKML-Reference: <1237009789.4387.2.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/common.c | 243 +++++++++++++++++++++++++------------------
 1 file changed, 143 insertions(+), 100 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cad6878c88d..a9e3791ca09 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,4 +1,3 @@
-#include <linux/topology.h>
 #include <linux/bootmem.h>
 #include <linux/linkage.h>
 #include <linux/bitops.h>
@@ -18,6 +17,7 @@
 #include <asm/hypervisor.h>
 #include <asm/processor.h>
 #include <asm/sections.h>
+#include <asm/topology.h>
 #include <asm/cpumask.h>
 #include <asm/pgtable.h>
 #include <asm/atomic.h>
@@ -82,45 +82,45 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 	 * TLS descriptors are currently at a different place compared to i386.
 	 * Hopefully nobody expects them at a fixed place (Wine?)
 	 */
-	[GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
-	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
-	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
-	[GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
-	[GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
-	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
+	[GDT_ENTRY_KERNEL32_CS]		= { { { 0x0000ffff, 0x00cf9b00 } } },
+	[GDT_ENTRY_KERNEL_CS]		= { { { 0x0000ffff, 0x00af9b00 } } },
+	[GDT_ENTRY_KERNEL_DS]		= { { { 0x0000ffff, 0x00cf9300 } } },
+	[GDT_ENTRY_DEFAULT_USER32_CS]	= { { { 0x0000ffff, 0x00cffb00 } } },
+	[GDT_ENTRY_DEFAULT_USER_DS]	= { { { 0x0000ffff, 0x00cff300 } } },
+	[GDT_ENTRY_DEFAULT_USER_CS]	= { { { 0x0000ffff, 0x00affb00 } } },
 #else
-	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
-	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
-	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
-	[GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
+	[GDT_ENTRY_KERNEL_CS]		= { { { 0x0000ffff, 0x00cf9a00 } } },
+	[GDT_ENTRY_KERNEL_DS]		= { { { 0x0000ffff, 0x00cf9200 } } },
+	[GDT_ENTRY_DEFAULT_USER_CS]	= { { { 0x0000ffff, 0x00cffa00 } } },
+	[GDT_ENTRY_DEFAULT_USER_DS]	= { { { 0x0000ffff, 0x00cff200 } } },
 	/*
 	 * Segments used for calling PnP BIOS have byte granularity.
 	 * They code segments and data segments have fixed 64k limits,
 	 * the transfer segment sizes are set at run time.
 	 */
 	/* 32-bit code */
-	[GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
+	[GDT_ENTRY_PNPBIOS_CS32]	= { { { 0x0000ffff, 0x00409a00 } } },
 	/* 16-bit code */
-	[GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
+	[GDT_ENTRY_PNPBIOS_CS16]	= { { { 0x0000ffff, 0x00009a00 } } },
 	/* 16-bit data */
-	[GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
+	[GDT_ENTRY_PNPBIOS_DS]		= { { { 0x0000ffff, 0x00009200 } } },
 	/* 16-bit data */
-	[GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
+	[GDT_ENTRY_PNPBIOS_TS1]		= { { { 0x00000000, 0x00009200 } } },
 	/* 16-bit data */
-	[GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
+	[GDT_ENTRY_PNPBIOS_TS2]		= { { { 0x00000000, 0x00009200 } } },
 	/*
 	 * The APM segments have byte granularity and their bases
 	 * are set at run time.  All have 64k limits.
 	 */
 	/* 32-bit code */
-	[GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
+	[GDT_ENTRY_APMBIOS_BASE]	= { { { 0x0000ffff, 0x00409a00 } } },
 	/* 16-bit code */
-	[GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
+	[GDT_ENTRY_APMBIOS_BASE+1]	= { { { 0x0000ffff, 0x00009a00 } } },
 	/* data */
-	[GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
+	[GDT_ENTRY_APMBIOS_BASE+2]	= { { { 0x0000ffff, 0x00409200 } } },
 
-	[GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
-	[GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
+	[GDT_ENTRY_ESPFIX_SS]		= { { { 0x00000000, 0x00c09200 } } },
+	[GDT_ENTRY_PERCPU]		= { { { 0x0000ffff, 0x00cf9200 } } },
 	GDT_STACK_CANARY_INIT
 #endif
 } };
@@ -164,16 +164,17 @@ static inline int flag_is_changeable_p(u32 flag)
 	 * the CPUID. Add "volatile" to not allow gcc to
 	 * optimize the subsequent calls to this function.
 	 */
-	asm volatile ("pushfl\n\t"
-		      "pushfl\n\t"
-		      "popl %0\n\t"
-		      "movl %0,%1\n\t"
-		      "xorl %2,%0\n\t"
-		      "pushl %0\n\t"
-		      "popfl\n\t"
-		      "pushfl\n\t"
-		      "popl %0\n\t"
-		      "popfl\n\t"
+	asm volatile ("pushfl		\n\t"
+		      "pushfl		\n\t"
+		      "popl %0		\n\t"
+		      "movl %0, %1	\n\t"
+		      "xorl %2, %0	\n\t"
+		      "pushl %0		\n\t"
+		      "popfl		\n\t"
+		      "pushfl		\n\t"
+		      "popl %0		\n\t"
+		      "popfl		\n\t"
+
 		      : "=&r" (f1), "=&r" (f2)
 		      : "ir" (flag));
 
@@ -188,18 +189,22 @@ static int __cpuinit have_cpuid_p(void)
 
 static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
 {
-	if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
-		/* Disable processor serial number */
-		unsigned long lo, hi;
-		rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
-		lo |= 0x200000;
-		wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
-		printk(KERN_NOTICE "CPU serial number disabled.\n");
-		clear_cpu_cap(c, X86_FEATURE_PN);
-
-		/* Disabling the serial number may affect the cpuid level */
-		c->cpuid_level = cpuid_eax(0);
-	}
+	unsigned long lo, hi;
+
+	if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr)
+		return;
+
+	/* Disable processor serial number: */
+
+	rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+	lo |= 0x200000;
+	wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+
+	printk(KERN_NOTICE "CPU serial number disabled.\n");
+	clear_cpu_cap(c, X86_FEATURE_PN);
+
+	/* Disabling the serial number may affect the cpuid level */
+	c->cpuid_level = cpuid_eax(0);
 }
 
 static int __init x86_serial_nr_setup(char *s)
@@ -232,6 +237,7 @@ struct cpuid_dependent_feature {
 	u32 feature;
 	u32 level;
 };
+
 static const struct cpuid_dependent_feature __cpuinitconst
 cpuid_dependent_features[] = {
 	{ X86_FEATURE_MWAIT,		0x00000005 },
@@ -245,6 +251,9 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
 	const struct cpuid_dependent_feature *df;
 
 	for (df = cpuid_dependent_features; df->feature; df++) {
+
+		if (!cpu_has(c, df->feature))
+			continue;
 		/*
 		 * Note: cpuid_level is set to -1 if unavailable, but
 		 * extended_extended_level is set to 0 if unavailable
@@ -252,26 +261,26 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
 		 * when signed; hence the weird messing around with
 		 * signs here...
 		 */
-		if (cpu_has(c, df->feature) &&
-		    ((s32)df->level < 0 ?
+		if (!((s32)df->level < 0 ?
 		     (u32)df->level > (u32)c->extended_cpuid_level :
-		     (s32)df->level > (s32)c->cpuid_level)) {
-			clear_cpu_cap(c, df->feature);
-			if (warn)
-				printk(KERN_WARNING
-				       "CPU: CPU feature %s disabled "
-				       "due to lack of CPUID level 0x%x\n",
-				       x86_cap_flags[df->feature],
-				       df->level);
-		}
+		     (s32)df->level > (s32)c->cpuid_level))
+			continue;
+
+		clear_cpu_cap(c, df->feature);
+		if (!warn)
+			continue;
+
+		printk(KERN_WARNING
+		       "CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
+				x86_cap_flags[df->feature], df->level);
 	}
 }
 
 /*
  * Naming convention should be: <Name> [(<Codename>)]
  * This table only is used unless init_<vendor>() below doesn't set it;
- * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
- *
+ * in particular, if CPUID levels 0x80000002..4 are supported, this
+ * isn't used
  */
 
 /* Look up CPU names by table lookup. */
@@ -308,8 +317,10 @@ void load_percpu_segment(int cpu)
 	load_stack_canary_segment();
 }
 
-/* Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one. */
+/*
+ * Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one.
+ */
 void switch_to_new_gdt(int cpu)
 {
 	struct desc_ptr gdt_descr;
@@ -355,14 +366,16 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
 	if (c->extended_cpuid_level < 0x80000004)
 		return;
 
-	v = (unsigned int *) c->x86_model_id;
+	v = (unsigned int *)c->x86_model_id;
 	cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
 	cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
 	cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
 	c->x86_model_id[48] = 0;
 
-	/* Intel chips right-justify this string for some dumb reason;
-	   undo that brain damage */
+	/*
+	 * Intel chips right-justify this string for some dumb reason;
+	 * undo that brain damage:
+	 */
 	p = q = &c->x86_model_id[0];
 	while (*p == ' ')
 		p++;
@@ -439,28 +452,30 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 
 	if (smp_num_siblings == 1) {
 		printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
-	} else if (smp_num_siblings > 1) {
+		goto out;
+	}
 
-		if (smp_num_siblings > nr_cpu_ids) {
-			pr_warning("CPU: Unsupported number of siblings %d",
-				   smp_num_siblings);
-			smp_num_siblings = 1;
-			return;
-		}
+	if (smp_num_siblings <= 1)
+		goto out;
 
-		index_msb = get_count_order(smp_num_siblings);
-		c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid,
-						    index_msb);
+	if (smp_num_siblings > nr_cpu_ids) {
+		pr_warning("CPU: Unsupported number of siblings %d",
+			   smp_num_siblings);
+		smp_num_siblings = 1;
+		return;
+	}
 
-		smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+	index_msb = get_count_order(smp_num_siblings);
+	c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
 
-		index_msb = get_count_order(smp_num_siblings);
+	smp_num_siblings = smp_num_siblings / c->x86_max_cores;
 
-		core_bits = get_count_order(c->x86_max_cores);
+	index_msb = get_count_order(smp_num_siblings);
 
-		c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
-					       ((1 << core_bits) - 1);
-	}
+	core_bits = get_count_order(c->x86_max_cores);
+
+	c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
+				       ((1 << core_bits) - 1);
 
 out:
 	if ((c->x86_max_cores * smp_num_siblings) > 1) {
@@ -475,8 +490,8 @@ out:
 static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 {
 	char *v = c->x86_vendor_id;
-	int i;
 	static int printed;
+	int i;
 
 	for (i = 0; i < X86_VENDOR_NUM; i++) {
 		if (!cpu_devs[i])
@@ -485,6 +500,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 		if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
 		    (cpu_devs[i]->c_ident[1] &&
 		     !strcmp(v, cpu_devs[i]->c_ident[1]))) {
+
 			this_cpu = cpu_devs[i];
 			c->x86_vendor = this_cpu->c_x86_vendor;
 			return;
@@ -493,8 +509,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 
 	if (!printed) {
 		printed++;
-		printk(KERN_ERR "CPU: vendor_id '%s'"
-			"unknown, using generic init.\n", v);
+		printk(KERN_ERR
+		    "CPU: vendor_id '%s' unknown, using generic init.\n", v);
+
 		printk(KERN_ERR "CPU: Your system may be unstable.\n");
 	}
 
@@ -514,14 +531,17 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
 	/* Intel-defined flags: level 0x00000001 */
 	if (c->cpuid_level >= 0x00000001) {
 		u32 junk, tfms, cap0, misc;
+
 		cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
 		c->x86 = (tfms >> 8) & 0xf;
 		c->x86_model = (tfms >> 4) & 0xf;
 		c->x86_mask = tfms & 0xf;
+
 		if (c->x86 == 0xf)
 			c->x86 += (tfms >> 20) & 0xff;
 		if (c->x86 >= 0x6)
 			c->x86_model += ((tfms >> 16) & 0xf) << 4;
+
 		if (cap0 & (1<<19)) {
 			c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
 			c->x86_cache_alignment = c->x86_clflush_size;
@@ -537,6 +557,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 	/* Intel-defined flags: level 0x00000001 */
 	if (c->cpuid_level >= 0x00000001) {
 		u32 capability, excap;
+
 		cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
 		c->x86_capability[0] = capability;
 		c->x86_capability[4] = excap;
@@ -545,6 +566,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 	/* AMD-defined flags: level 0x80000001 */
 	xlvl = cpuid_eax(0x80000000);
 	c->extended_cpuid_level = xlvl;
+
 	if ((xlvl & 0xffff0000) == 0x80000000) {
 		if (xlvl >= 0x80000001) {
 			c->x86_capability[1] = cpuid_edx(0x80000001);
@@ -762,8 +784,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 	squash_the_stupid_serial_number(c);
 
 	/*
-	 * The vendor-specific functions might have changed features.  Now
-	 * we do "generic changes."
+	 * The vendor-specific functions might have changed features.
+	 * Now we do "generic changes."
 	 */
 
 	/* Filter out anything that depends on CPUID levels we don't have */
@@ -846,8 +868,8 @@ void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
 }
 
 struct msr_range {
-	unsigned min;
-	unsigned max;
+	unsigned	min;
+	unsigned	max;
 };
 
 static struct msr_range msr_range_array[] __cpuinitdata = {
@@ -859,14 +881,15 @@ static struct msr_range msr_range_array[] __cpuinitdata = {
 
 static void __cpuinit print_cpu_msr(void)
 {
+	unsigned index_min, index_max;
 	unsigned index;
 	u64 val;
 	int i;
-	unsigned index_min, index_max;
 
 	for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
 		index_min = msr_range_array[i].min;
 		index_max = msr_range_array[i].max;
+
 		for (index = index_min; index < index_max; index++) {
 			if (rdmsrl_amd_safe(index, &val))
 				continue;
@@ -876,6 +899,7 @@ static void __cpuinit print_cpu_msr(void)
 }
 
 static int show_msr __cpuinitdata;
+
 static __init int setup_show_msr(char *arg)
 {
 	int num;
@@ -899,10 +923,12 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
 {
 	char *vendor = NULL;
 
-	if (c->x86_vendor < X86_VENDOR_NUM)
+	if (c->x86_vendor < X86_VENDOR_NUM) {
 		vendor = this_cpu->c_vendor;
-	else if (c->cpuid_level >= 0)
-		vendor = c->x86_vendor_id;
+	} else {
+		if (c->cpuid_level >= 0)
+			vendor = c->x86_vendor_id;
+	}
 
 	if (vendor && !strstr(c->x86_model_id, vendor))
 		printk(KERN_CONT "%s ", vendor);
@@ -929,10 +955,12 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
 static __init int setup_disablecpuid(char *arg)
 {
 	int bit;
+
 	if (get_option(&arg, &bit) && bit < NCAPINTS*32)
 		setup_clear_cpu_cap(bit);
 	else
 		return 0;
+
 	return 1;
 }
 __setup("clearcpuid=", setup_disablecpuid);
@@ -942,6 +970,7 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
 
 DEFINE_PER_CPU_FIRST(union irq_stack_union,
 		     irq_stack_union) __aligned(PAGE_SIZE);
+
 DEFINE_PER_CPU(char *, irq_stack_ptr) =
 	init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
 
@@ -951,6 +980,17 @@ EXPORT_PER_CPU_SYMBOL(kernel_stack);
 
 DEFINE_PER_CPU(unsigned int, irq_count) = -1;
 
+/*
+ * Special IST stacks which the CPU switches to when it calls
+ * an IST-marked descriptor entry. Up to 7 stacks (hardware
+ * limit), all of them are 4K, except the debug stack which
+ * is 8K.
+ */
+static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+	  [0 ... N_EXCEPTION_STACKS - 1]	= EXCEPTION_STKSZ,
+	  [DEBUG_STACK - 1]			= DEBUG_STKSZ
+};
+
 static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
 	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
 	__aligned(PAGE_SIZE);
@@ -984,7 +1024,7 @@ unsigned long kernel_eflags;
  */
 DEFINE_PER_CPU(struct orig_ist, orig_ist);
 
-#else	/* x86_64 */
+#else	/* CONFIG_X86_64 */
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 DEFINE_PER_CPU(unsigned long, stack_canary);
@@ -996,9 +1036,10 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
 	memset(regs, 0, sizeof(struct pt_regs));
 	regs->fs = __KERNEL_PERCPU;
 	regs->gs = __KERNEL_STACK_CANARY;
+
 	return regs;
 }
-#endif	/* x86_64 */
+#endif	/* CONFIG_X86_64 */
 
 /*
  * Clear all 6 debug registers:
@@ -1024,15 +1065,20 @@ static void clear_all_debug_regs(void)
  * A lot of state is already set up in PDA init for 64 bit
  */
 #ifdef CONFIG_X86_64
+
 void __cpuinit cpu_init(void)
 {
-	int cpu = stack_smp_processor_id();
-	struct tss_struct *t = &per_cpu(init_tss, cpu);
-	struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
-	unsigned long v;
+	struct orig_ist *orig_ist;
 	struct task_struct *me;
+	struct tss_struct *t;
+	unsigned long v;
+	int cpu;
 	int i;
 
+	cpu = stack_smp_processor_id();
+	t = &per_cpu(init_tss, cpu);
+	orig_ist = &per_cpu(orig_ist, cpu);
+
 #ifdef CONFIG_NUMA
 	if (cpu != 0 && percpu_read(node_number) == 0 &&
 	    cpu_to_node(cpu) != NUMA_NO_NODE)
@@ -1073,19 +1119,17 @@ void __cpuinit cpu_init(void)
 	 * set up and load the per-CPU TSS
 	 */
 	if (!orig_ist->ist[0]) {
-		static const unsigned int sizes[N_EXCEPTION_STACKS] = {
-		  [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
-		  [DEBUG_STACK - 1] = DEBUG_STKSZ
-		};
 		char *estacks = per_cpu(exception_stacks, cpu);
+
 		for (v = 0; v < N_EXCEPTION_STACKS; v++) {
-			estacks += sizes[v];
+			estacks += exception_stack_sizes[v];
 			orig_ist->ist[v] = t->x86_tss.ist[v] =
 					(unsigned long)estacks;
 		}
 	}
 
 	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+
 	/*
 	 * <= is required because the CPU will access up to
 	 * 8 bits beyond the end of the IO permission bitmap.
@@ -1187,5 +1231,4 @@ void __cpuinit cpu_init(void)
 
 	xsave_init();
 }
-
 #endif
-- 
cgit v1.2.3


From 78a8b35bc7abf8b8333d6f625e08c0f7cc1c3742 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 12 Mar 2009 22:36:01 -0700
Subject: x86: make e820_update_range() handle small range update

Impact: enhance e820 code to handle more cases

Try to handle new range which could be covered by one entry.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: jbeulich@novell.com
LKML-Reference: <49B9F0C1.10402@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 95b81c18b6b..0c34ff49ff4 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -427,6 +427,7 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
 					u64 size, unsigned old_type,
 					unsigned new_type)
 {
+	u64 end;
 	unsigned int i;
 	u64 real_updated_size = 0;
 
@@ -435,21 +436,35 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
 	if (size > (ULLONG_MAX - start))
 		size = ULLONG_MAX - start;
 
+	end = start + size;
 	for (i = 0; i < e820x->nr_map; i++) {
 		struct e820entry *ei = &e820x->map[i];
 		u64 final_start, final_end;
+		u64 ei_end;
+
 		if (ei->type != old_type)
 			continue;
-		/* totally covered? */
-		if (ei->addr >= start &&
-		    (ei->addr + ei->size) <= (start + size)) {
+
+		ei_end = ei->addr + ei->size;
+		/* totally covered by new range? */
+		if (ei->addr >= start && ei_end <= end) {
 			ei->type = new_type;
 			real_updated_size += ei->size;
 			continue;
 		}
+
+		/* new range is totally covered? */
+		if (ei->addr < start && ei_end > end) {
+			__e820_add_region(e820x, start, size, new_type);
+			__e820_add_region(e820x, end, ei_end - end, ei->type);
+			ei->size = start - ei->addr;
+			real_updated_size += size;
+			continue;
+		}
+
 		/* partially covered */
 		final_start = max(start, ei->addr);
-		final_end = min(start + size, ei->addr + ei->size);
+		final_end = min(end, ei_end);
 		if (final_start >= final_end)
 			continue;
 
-- 
cgit v1.2.3


From 63516ef6d6a8379ee5c32463efc6a75f9da8a309 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 13 Mar 2009 12:46:07 -0700
Subject: x86: fix get_mtrr() warning about smp_processor_id() with
 CONFIG_PREEMPT=y

Impact: fix debug warning

Jaswinder noticed that there is a warning about smp_processor_id()
in get_mtrr().

Fix it by wrapping the printout into a get/put_cpu() pair.

Reported-by: Jaswinder Singh Rajput <jaswinder@kernel.org>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <49BAB7FF.4030107@kernel.org>
[ changed to get/put_cpu(), cleaned up surrounding code a it. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mtrr/generic.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 950c434f793..641ee3507d0 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -381,11 +381,14 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
 {
 	unsigned int mask_lo, mask_hi, base_lo, base_hi;
 	unsigned int tmp, hi;
+	int cpu;
 
 	/*
 	 * get_mtrr doesn't need to update mtrr_state, also it could be called
 	 * from any cpu, so try to print it out directly.
 	 */
+	cpu = get_cpu();
+
 	rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
 
 	if ((mask_lo & 0x800) == 0) {
@@ -393,15 +396,16 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
 		*base = 0;
 		*size = 0;
 		*type = 0;
-		return;
+		goto out_put_cpu;
 	}
 
 	rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
 
-	/* Work out the shifted address mask. */
+	/* Work out the shifted address mask: */
 	tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
 	mask_lo = size_or_mask | tmp;
-	/* Expand tmp with high bits to all 1s*/
+
+	/* Expand tmp with high bits to all 1s: */
 	hi = fls(tmp);
 	if (hi > 0) {
 		tmp |= ~((1<<(hi - 1)) - 1);
@@ -412,15 +416,19 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
 		}
 	}
 
-	/* This works correctly if size is a power of two, i.e. a
-	   contiguous range. */
+	/*
+	 * This works correctly if size is a power of two, i.e. a
+	 * contiguous range:
+	 */
 	*size = -mask_lo;
 	*base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
 	*type = base_lo & 0xff;
 
 	printk(KERN_DEBUG "  get_mtrr: cpu%d reg%02d base=%010lx size=%010lx %s\n",
-			smp_processor_id(), reg, *base, *size,
+			cpu, reg, *base, *size,
 			mtrr_attrib_to_str(*type & 0xff));
+out_put_cpu:
+	put_cpu();
 }
 
 /**
-- 
cgit v1.2.3


From d4c90e37a21154c1910b2646e9544bdd32d5bc3a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 13 Mar 2009 14:08:49 -0700
Subject: x86: print the continous part of fixed mtrrs together

Impact: print out fewer lines

 1. print continuous range with same type together
 2. change _INFO to _DEBUG

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49BACB61.8000302@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mtrr/generic.c | 58 +++++++++++++++++++++++++++++++-------
 1 file changed, 48 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 641ee3507d0..37f28fc7cf9 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -208,13 +208,47 @@ void mtrr_save_fixed_ranges(void *info)
 		get_fixed_ranges(mtrr_state.fixed_ranges);
 }
 
-static void print_fixed(unsigned base, unsigned step, const mtrr_type*types)
+static unsigned __initdata last_fixed_start;
+static unsigned __initdata last_fixed_end;
+static mtrr_type __initdata last_fixed_type;
+
+static void __init print_fixed_last(void)
+{
+	if (!last_fixed_end)
+		return;
+
+	printk(KERN_DEBUG "  %05X-%05X %s\n", last_fixed_start,
+		last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
+
+	last_fixed_end = 0;
+}
+
+static void __init update_fixed_last(unsigned base, unsigned end,
+				       mtrr_type type)
+{
+	last_fixed_start = base;
+	last_fixed_end = end;
+	last_fixed_type = type;
+}
+
+static void __init print_fixed(unsigned base, unsigned step,
+			       const mtrr_type *types)
 {
 	unsigned i;
 
-	for (i = 0; i < 8; ++i, ++types, base += step)
-		printk(KERN_INFO "  %05X-%05X %s\n",
-			base, base + step - 1, mtrr_attrib_to_str(*types));
+	for (i = 0; i < 8; ++i, ++types, base += step) {
+		if (last_fixed_end == 0) {
+			update_fixed_last(base, base + step, *types);
+			continue;
+		}
+		if (last_fixed_end == base && last_fixed_type == *types) {
+			last_fixed_end = base + step;
+			continue;
+		}
+		/* new segments: gap or different type */
+		print_fixed_last();
+		update_fixed_last(base, base + step, *types);
+	}
 }
 
 static void prepare_set(void);
@@ -225,22 +259,26 @@ static void __init print_mtrr_state(void)
 	unsigned int i;
 	int high_width;
 
-	printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
+	printk(KERN_DEBUG "MTRR default type: %s\n",
+			 mtrr_attrib_to_str(mtrr_state.def_type));
 	if (mtrr_state.have_fixed) {
-		printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
+		printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n",
 		       mtrr_state.enabled & 1 ? "en" : "dis");
 		print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
 		for (i = 0; i < 2; ++i)
 			print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
 		for (i = 0; i < 8; ++i)
 			print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
+
+		/* tail */
+		print_fixed_last();
 	}
-	printk(KERN_INFO "MTRR variable ranges %sabled:\n",
+	printk(KERN_DEBUG "MTRR variable ranges %sabled:\n",
 	       mtrr_state.enabled & 2 ? "en" : "dis");
 	high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
 	for (i = 0; i < num_var_ranges; ++i) {
 		if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
-			printk(KERN_INFO "  %u base %0*X%05X000 mask %0*X%05X000 %s\n",
+			printk(KERN_DEBUG "  %u base %0*X%05X000 mask %0*X%05X000 %s\n",
 			       i,
 			       high_width,
 			       mtrr_state.var_ranges[i].base_hi,
@@ -250,10 +288,10 @@ static void __init print_mtrr_state(void)
 			       mtrr_state.var_ranges[i].mask_lo >> 12,
 			       mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
 		else
-			printk(KERN_INFO "   %u disabled\n", i);
+			printk(KERN_DEBUG "  %u disabled\n", i);
 	}
 	if (mtrr_tom2) {
-		printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
+		printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n",
 				  mtrr_tom2, mtrr_tom2>>20);
 	}
 }
-- 
cgit v1.2.3


From 48f4c485c275e9550fa1a1191768689cc3ae0037 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Date: Sat, 14 Mar 2009 12:24:02 +0100
Subject: x86/centaur: merge 32 & 64 bit version

there should be no difference, except:

 * the 64bit variant now also initializes the padlock unit.
 * ->c_early_init() is executed again from ->c_init()
 * the 64bit fixups made into 32bit path.

Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: herbert@gondor.apana.org.au
LKML-Reference: <1237029843-28076-2-git-send-email-sebastian@breakpoint.cc>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/Makefile     |  3 +--
 arch/x86/kernel/cpu/centaur.c    | 34 +++++++++++++++++++++++++++-------
 arch/x86/kernel/cpu/centaur_64.c | 37 -------------------------------------
 3 files changed, 28 insertions(+), 46 deletions(-)
 delete mode 100644 arch/x86/kernel/cpu/centaur_64.c

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index d4356f8b752..4e242f9a06e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -19,8 +19,7 @@ obj-$(CONFIG_X86_CPU_DEBUG)		+= cpu_debug.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= intel.o
 obj-$(CONFIG_CPU_SUP_AMD)		+= amd.o
 obj-$(CONFIG_CPU_SUP_CYRIX_32)		+= cyrix.o
-obj-$(CONFIG_CPU_SUP_CENTAUR_32)	+= centaur.o
-obj-$(CONFIG_CPU_SUP_CENTAUR_64)	+= centaur_64.o
+obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
 
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 983e0830f0d..c95e831bb09 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,11 +1,11 @@
+#include <linux/bitops.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
-#include <linux/bitops.h>
 
 #include <asm/processor.h>
-#include <asm/msr.h>
 #include <asm/e820.h>
 #include <asm/mtrr.h>
+#include <asm/msr.h>
 
 #include "cpu.h"
 
@@ -276,7 +276,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
 		 */
 		c->x86_capability[5] = cpuid_edx(0xC0000001);
 	}
-
+#ifdef CONFIG_X86_32
 	/* Cyrix III family needs CX8 & PGE explicitly enabled. */
 	if (c->x86_model >= 6 && c->x86_model <= 9) {
 		rdmsr(MSR_VIA_FCR, lo, hi);
@@ -288,6 +288,11 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
 	/* Before Nehemiah, the C3's had 3dNOW! */
 	if (c->x86_model >= 6 && c->x86_model < 9)
 		set_cpu_cap(c, X86_FEATURE_3DNOW);
+#endif
+	if (c->x86 == 0x6 && c->x86_model >= 0xf) {
+		c->x86_cache_alignment = c->x86_clflush_size * 2;
+		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+	}
 
 	display_cacheinfo(c);
 }
@@ -316,16 +321,25 @@ enum {
 static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
 {
 	switch (c->x86) {
+#ifdef CONFIG_X86_32
 	case 5:
 		/* Emulate MTRRs using Centaur's MCR. */
 		set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
 		break;
+#endif
+	case 6:
+		if (c->x86_model >= 0xf)
+			set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+		break;
 	}
+#ifdef CONFIG_X86_64
+	set_cpu_cap(c, X86_FEATURE_SYSENTER32);
+#endif
 }
 
 static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
 {
-
+#ifdef CONFIG_X86_32
 	char *name;
 	u32  fcr_set = 0;
 	u32  fcr_clr = 0;
@@ -337,8 +351,10 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
 	 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
 	 */
 	clear_cpu_cap(c, 0*32+31);
-
+#endif
+	early_init_centaur(c);
 	switch (c->x86) {
+#ifdef CONFIG_X86_32
 	case 5:
 		switch (c->x86_model) {
 		case 4:
@@ -442,16 +458,20 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
 		}
 		sprintf(c->x86_model_id, "WinChip %s", name);
 		break;
-
+#endif
 	case 6:
 		init_c3(c);
 		break;
 	}
+#ifdef CONFIG_X86_64
+	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+#endif
 }
 
 static unsigned int __cpuinit
 centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 {
+#ifdef CONFIG_X86_32
 	/* VIA C3 CPUs (670-68F) need further shifting. */
 	if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
 		size >>= 8;
@@ -464,7 +484,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 	if ((c->x86 == 6) && (c->x86_model == 9) &&
 				(c->x86_mask == 1) && (size == 65))
 		size -= 1;
-
+#endif
 	return size;
 }
 
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
deleted file mode 100644
index 51b09c48c9c..00000000000
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ /dev/null
@@ -1,37 +0,0 @@
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-
-#include "cpu.h"
-
-static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
-{
-	if (c->x86 == 0x6 && c->x86_model >= 0xf)
-		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-
-	set_cpu_cap(c, X86_FEATURE_SYSENTER32);
-}
-
-static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
-{
-	early_init_centaur(c);
-
-	if (c->x86 == 0x6 && c->x86_model >= 0xf) {
-		c->x86_cache_alignment = c->x86_clflush_size * 2;
-		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-	}
-	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
-}
-
-static const struct cpu_dev centaur_cpu_dev __cpuinitconst = {
-	.c_vendor	= "Centaur",
-	.c_ident	= { "CentaurHauls" },
-	.c_early_init	= early_init_centaur,
-	.c_init		= init_centaur,
-	.c_x86_vendor	= X86_VENDOR_CENTAUR,
-};
-
-cpu_dev_register(centaur_cpu_dev);
-
-- 
cgit v1.2.3


From f4c3c4cdb1de232ff37cf4339eb2f36c84e20da6 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Fri, 13 Mar 2009 19:59:26 +0530
Subject: x86: cpu_debug add support for various AMD CPUs

Impact: Added AMD CPUs support

Added flags for various AMD CPUs.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/cpu_debug.c | 150 ++++++++++++++++++++++++++++------------
 1 file changed, 106 insertions(+), 44 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 21c0cf8ced1..46e29ab96c6 100755
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -64,6 +64,8 @@ static struct cpu_debug_base cpu_base[] = {
 	{ "vmx",	CPU_VMX,	0	},
 	{ "call",	CPU_CALL,	0	},
 	{ "base",	CPU_BASE,	0	},
+	{ "ver",	CPU_VER,	0	},
+	{ "conf",	CPU_CONF,	0	},
 	{ "smm",	CPU_SMM,	0	},
 	{ "svm",	CPU_SVM,	0	},
 	{ "osvm",	CPU_OSVM,	0	},
@@ -177,54 +179,59 @@ static struct cpu_debug_range cpu_intel_range[] = {
 
 /* AMD Registers Range */
 static struct cpu_debug_range cpu_amd_range[] = {
-	{ 0x00000010, 0x00000010, CPU_TIME,	CPU_ALL,		},
-	{ 0x0000001B, 0x0000001B, CPU_APIC,	CPU_ALL,		},
-	{ 0x000000FE, 0x000000FE, CPU_MTRR,	CPU_ALL,		},
-
-	{ 0x00000174, 0x00000176, CPU_SYSENTER,	CPU_ALL,		},
-	{ 0x00000179, 0x0000017A, CPU_MC,	CPU_ALL,		},
-	{ 0x0000017B, 0x0000017B, CPU_MC,	CPU_ALL,		},
-	{ 0x000001D9, 0x000001D9, CPU_DEBUG,	CPU_ALL,		},
-	{ 0x000001DB, 0x000001DE, CPU_LBRANCH,	CPU_ALL,		},
-
-	{ 0x00000200, 0x0000020F, CPU_MTRR,	CPU_ALL,		},
-	{ 0x00000250, 0x00000250, CPU_MTRR,	CPU_ALL,		},
-	{ 0x00000258, 0x00000259, CPU_MTRR,	CPU_ALL,		},
-	{ 0x00000268, 0x0000026F, CPU_MTRR,	CPU_ALL,		},
-	{ 0x00000277, 0x00000277, CPU_PAT,	CPU_ALL,		},
-	{ 0x000002FF, 0x000002FF, CPU_MTRR,	CPU_ALL,		},
-
-	{ 0x00000400, 0x00000417, CPU_MC,	CPU_ALL,		},
-
-	{ 0xC0000080, 0xC0000080, CPU_FEATURES,	CPU_ALL,		},
-	{ 0xC0000081, 0xC0000084, CPU_CALL,	CPU_ALL,		},
-	{ 0xC0000100, 0xC0000102, CPU_BASE,	CPU_ALL,		},
-	{ 0xC0000103, 0xC0000103, CPU_TIME,	CPU_ALL,		},
-
-	{ 0xC0000408, 0xC000040A, CPU_MC,	CPU_ALL,		},
-
-	{ 0xc0010000, 0xc0010007, CPU_PMC,	CPU_ALL,		},
-	{ 0xc0010010, 0xc0010010, CPU_MTRR,	CPU_ALL,		},
-	{ 0xc0010016, 0xc001001A, CPU_MTRR,	CPU_ALL,		},
-	{ 0xc001001D, 0xc001001D, CPU_MTRR,	CPU_ALL,		},
-	{ 0xc0010030, 0xc0010035, CPU_BIOS,	CPU_ALL,		},
-	{ 0xc0010056, 0xc0010056, CPU_SMM,	CPU_ALL,		},
-	{ 0xc0010061, 0xc0010063, CPU_SMM,	CPU_ALL,		},
-	{ 0xc0010074, 0xc0010074, CPU_MC,	CPU_ALL,		},
-	{ 0xc0010111, 0xc0010113, CPU_SMM,	CPU_ALL,		},
-	{ 0xc0010114, 0xc0010118, CPU_SVM,	CPU_ALL,		},
-	{ 0xc0010119, 0xc001011A, CPU_SMM,	CPU_ALL,		},
-	{ 0xc0010140, 0xc0010141, CPU_OSVM,	CPU_ALL,		},
-	{ 0xc0010156, 0xc0010156, CPU_SMM,	CPU_ALL,		},
+	{ 0x00000000, 0x00000001, CPU_MC,	CPU_K10_PLUS,		},
+	{ 0x00000010, 0x00000010, CPU_TIME,	CPU_K8_PLUS,		},
+	{ 0x0000001B, 0x0000001B, CPU_APIC,	CPU_K8_PLUS,		},
+	{ 0x0000002A, 0x0000002A, CPU_POWERON,	CPU_K7_PLUS		},
+	{ 0x0000008B, 0x0000008B, CPU_VER,	CPU_K8_PLUS		},
+	{ 0x000000FE, 0x000000FE, CPU_MTRR,	CPU_K8_PLUS,		},
+
+	{ 0x00000174, 0x00000176, CPU_SYSENTER,	CPU_K8_PLUS,		},
+	{ 0x00000179, 0x0000017B, CPU_MC,	CPU_K8_PLUS,		},
+	{ 0x000001D9, 0x000001D9, CPU_DEBUG,	CPU_K8_PLUS,		},
+	{ 0x000001DB, 0x000001DE, CPU_LBRANCH,	CPU_K8_PLUS,		},
+
+	{ 0x00000200, 0x0000020F, CPU_MTRR,	CPU_K8_PLUS,		},
+	{ 0x00000250, 0x00000250, CPU_MTRR,	CPU_K8_PLUS,		},
+	{ 0x00000258, 0x00000259, CPU_MTRR,	CPU_K8_PLUS,		},
+	{ 0x00000268, 0x0000026F, CPU_MTRR,	CPU_K8_PLUS,		},
+	{ 0x00000277, 0x00000277, CPU_PAT,	CPU_K8_PLUS,		},
+	{ 0x000002FF, 0x000002FF, CPU_MTRR,	CPU_K8_PLUS,		},
+
+	{ 0x00000400, 0x00000413, CPU_MC,	CPU_K8_PLUS,		},
+
+	{ 0xC0000080, 0xC0000080, CPU_FEATURES,	CPU_AMD_ALL,		},
+	{ 0xC0000081, 0xC0000084, CPU_CALL,	CPU_K8_PLUS,		},
+	{ 0xC0000100, 0xC0000102, CPU_BASE,	CPU_K8_PLUS,		},
+	{ 0xC0000103, 0xC0000103, CPU_TIME,	CPU_K10_PLUS,		},
+
+	{ 0xC0010000, 0xC0010007, CPU_PMC,	CPU_K8_PLUS,		},
+	{ 0xC0010010, 0xC0010010, CPU_CONF,	CPU_K7_PLUS,		},
+	{ 0xC0010015, 0xC0010015, CPU_CONF,	CPU_K7_PLUS,		},
+	{ 0xC0010016, 0xC001001A, CPU_MTRR,	CPU_K8_PLUS,		},
+	{ 0xC001001D, 0xC001001D, CPU_MTRR,	CPU_K8_PLUS,		},
+	{ 0xC001001F, 0xC001001F, CPU_CONF,	CPU_K8_PLUS,		},
+	{ 0xC0010030, 0xC0010035, CPU_BIOS,	CPU_K8_PLUS,		},
+	{ 0xC0010044, 0xC0010048, CPU_MC,	CPU_K8_PLUS,		},
+	{ 0xC0010050, 0xC0010056, CPU_SMM,	CPU_K0F_PLUS,		},
+	{ 0xC0010058, 0xC0010058, CPU_CONF,	CPU_K10_PLUS,		},
+	{ 0xC0010060, 0xC0010060, CPU_CACHE,	CPU_AMD_11,		},
+	{ 0xC0010061, 0xC0010068, CPU_SMM,	CPU_K10_PLUS,		},
+	{ 0xC0010069, 0xC001006B, CPU_SMM,	CPU_AMD_11,		},
+	{ 0xC0010070, 0xC0010071, CPU_SMM,	CPU_K10_PLUS,		},
+	{ 0xC0010111, 0xC0010113, CPU_SMM,	CPU_K8_PLUS,		},
+	{ 0xC0010114, 0xC0010118, CPU_SVM,	CPU_K10_PLUS,		},
+	{ 0xC0010140, 0xC0010141, CPU_OSVM,	CPU_K10_PLUS,		},
+	{ 0xC0011022, 0xC0011023, CPU_CONF,	CPU_K10_PLUS,		},
 };
 
 
-static int get_cpu_modelflag(unsigned cpu)
+/* Intel */
+static int get_intel_modelflag(unsigned model)
 {
 	int flag;
 
-	switch (per_cpu(cpu_model, cpu)) {
-	/* Intel */
+	switch (model) {
 	case 0x0501:
 	case 0x0502:
 	case 0x0504:
@@ -271,6 +278,59 @@ static int get_cpu_modelflag(unsigned cpu)
 	return flag;
 }
 
+/* AMD */
+static int get_amd_modelflag(unsigned model)
+{
+	int flag;
+
+	switch (model >> 8) {
+	case 0x6:
+		flag = CPU_AMD_K6;
+		break;
+	case 0x7:
+		flag = CPU_AMD_K7;
+		break;
+	case 0x8:
+		flag = CPU_AMD_K8;
+		break;
+	case 0xf:
+		flag = CPU_AMD_0F;
+		break;
+	case 0x10:
+		flag = CPU_AMD_10;
+		break;
+	case 0x11:
+		flag = CPU_AMD_11;
+		break;
+	default:
+		flag = CPU_NONE;
+		break;
+	}
+
+	return flag;
+}
+
+static int get_cpu_modelflag(unsigned cpu)
+{
+	int flag;
+
+	flag = per_cpu(cpu_model, cpu);
+
+	switch (flag >> 16) {
+	case X86_VENDOR_INTEL:
+		flag = get_intel_modelflag(flag);
+		break;
+	case X86_VENDOR_AMD:
+		flag = get_amd_modelflag(flag & 0xffff);
+		break;
+	default:
+		flag = CPU_NONE;
+		break;
+	}
+
+	return flag;
+}
+
 static int get_cpu_range_count(unsigned cpu)
 {
 	int index;
@@ -311,7 +371,8 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag)
 				return 1;
 			break;
 		case X86_VENDOR_AMD:
-			if (cpu_amd_range[i].flag & flag)
+			if ((cpu_amd_range[i].model & modelflag) &&
+			    (cpu_amd_range[i].flag & flag))
 				return 1;
 			break;
 		}
@@ -337,7 +398,8 @@ static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
 		}
 		break;
 	case X86_VENDOR_AMD:
-		if (cpu_amd_range[index].flag & flag) {
+		if ((cpu_amd_range[index].model & modelflag) &&
+		    (cpu_amd_range[index].flag & flag)) {
 			*min = cpu_amd_range[index].min;
 			*max = cpu_amd_range[index].max;
 		}
-- 
cgit v1.2.3


From b9719a4d9cfa5d46fa6a1eb3e3fc2238e7981e4d Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 10 Mar 2009 11:19:18 -0700
Subject: x86: make section delimiter symbols part of their section

Impact: cleanup

Move the symbols delimiting a section part of the section
(section relative) rather than absolute.  This avoids any
unexpected gaps between the section-start symbol and the first
data in the section, which could be caused by implicit
alignment of the section data.  It also makes the general
form of vmlinux_64.lds.S consistent with vmlinux_32.lds.S.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/vmlinux_64.lds.S | 85 +++++++++++++++++++++-------------------
 1 file changed, 45 insertions(+), 40 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 5bf54e40c6e..fe5d21ce724 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -29,8 +29,8 @@ SECTIONS
 {
   . = __START_KERNEL;
   phys_startup_64 = startup_64 - LOAD_OFFSET;
-  _text = .;			/* Text and read-only data */
   .text :  AT(ADDR(.text) - LOAD_OFFSET) {
+	_text = .;			/* Text and read-only data */
 	/* First the code that has to be first for bootstrapping */
 	*(.text.head)
 	_stext = .;
@@ -61,13 +61,13 @@ SECTIONS
   .data : AT(ADDR(.data) - LOAD_OFFSET) {
 	DATA_DATA
 	CONSTRUCTORS
+	_edata = .;			/* End of data section */
 	} :data
 
-  _edata = .;			/* End of data section */
 
-  . = ALIGN(PAGE_SIZE);
-  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
   .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+	. = ALIGN(PAGE_SIZE);
+	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
 	*(.data.cacheline_aligned)
   }
   . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
@@ -125,29 +125,29 @@ SECTIONS
 #undef VVIRT_OFFSET
 #undef VVIRT
 
-  . = ALIGN(THREAD_SIZE);	/* init_task */
   .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+	. = ALIGN(THREAD_SIZE);	/* init_task */
 	*(.data.init_task)
   }:data.init
 
-  . = ALIGN(PAGE_SIZE);
   .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+	. = ALIGN(PAGE_SIZE);
 	*(.data.page_aligned)
   }
 
-  /* might get freed after init */
-  . = ALIGN(PAGE_SIZE);
-  __smp_alt_begin = .;
-  __smp_locks = .;
   .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+	/* might get freed after init */
+	. = ALIGN(PAGE_SIZE);
+	__smp_alt_begin = .;
+	__smp_locks = .;
 	*(.smp_locks)
+	__smp_locks_end = .;
+	. = ALIGN(PAGE_SIZE);
+	__smp_alt_end = .;
   }
-  __smp_locks_end = .;
-  . = ALIGN(PAGE_SIZE);
-  __smp_alt_end = .;
 
   . = ALIGN(PAGE_SIZE);		/* Init code and data */
-  __init_begin = .;
+  __init_begin = .;	/* paired with __init_end */
   .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
 	_sinittext = .;
 	INIT_TEXT
@@ -159,40 +159,42 @@ SECTIONS
 	__initdata_end = .;
    }
 
-  . = ALIGN(16);
-  __setup_start = .;
-  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
-  __setup_end = .;
-  __initcall_start = .;
+  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+	. = ALIGN(16);
+	__setup_start = .;
+	*(.init.setup)
+	__setup_end = .;
+  }
   .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+	__initcall_start = .;
 	INITCALLS
+	__initcall_end = .;
   }
-  __initcall_end = .;
-  __con_initcall_start = .;
   .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+	__con_initcall_start = .;
 	*(.con_initcall.init)
+	__con_initcall_end = .;
   }
-  __con_initcall_end = .;
-  __x86_cpu_dev_start = .;
   .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+	__x86_cpu_dev_start = .;
 	*(.x86_cpu_dev.init)
+	__x86_cpu_dev_end = .;
   }
-  __x86_cpu_dev_end = .;
   SECURITY_INIT
 
   . = ALIGN(8);
   .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
-  __parainstructions = .;
+	__parainstructions = .;
        *(.parainstructions)
-  __parainstructions_end = .;
+	__parainstructions_end = .;
   }
 
-  . = ALIGN(8);
-  __alt_instructions = .;
   .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+	. = ALIGN(8);
+	__alt_instructions = .;
 	*(.altinstructions)
+	__alt_instructions_end = .;
   }
-  __alt_instructions_end = .;
   .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
 	*(.altinstr_replacement)
   }
@@ -207,9 +209,11 @@ SECTIONS
 
 #ifdef CONFIG_BLK_DEV_INITRD
   . = ALIGN(PAGE_SIZE);
-  __initramfs_start = .;
-  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
-  __initramfs_end = .;
+  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+	__initramfs_start = .;
+	*(.init.ramfs)
+	__initramfs_end = .;
+  }
 #endif
 
 #ifdef CONFIG_SMP
@@ -229,20 +233,21 @@ SECTIONS
   . = ALIGN(PAGE_SIZE);
   __init_end = .;
 
-  . = ALIGN(PAGE_SIZE);
-  __nosave_begin = .;
   .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-      *(.data.nosave)
+	. = ALIGN(PAGE_SIZE);
+	__nosave_begin = .;
+	*(.data.nosave)
+	. = ALIGN(PAGE_SIZE);
+	__nosave_end = .;
   } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
-  . = ALIGN(PAGE_SIZE);
-  __nosave_end = .;
 
-  __bss_start = .;		/* BSS */
   .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+	. = ALIGN(PAGE_SIZE);
+	__bss_start = .;		/* BSS */
 	*(.bss.page_aligned)
 	*(.bss)
-	}
-  __bss_stop = .;
+	__bss_stop = .;
+  }
 
   _end = . ;
 
-- 
cgit v1.2.3


From 93dbda7cbcd70a0bd1a99f39f44a9ccde8ab9040 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Thu, 26 Feb 2009 17:35:44 -0800
Subject: x86: add brk allocation for very, very early allocations

Impact: new interface

Add a brk()-like allocator which effectively extends the bss in order
to allow very early code to do dynamic allocations.  This is better than
using statically allocated arrays for data in subsystems which may never
get used.

The space for brk allocations is in the bss ELF segment, so that the
space is mapped properly by the code which maps the kernel, and so
that bootloaders keep the space free rather than putting a ramdisk or
something into it.

The bss itself, delimited by __bss_stop, ends before the brk area
(__brk_base to __brk_limit).  The kernel text, data and bss is reserved
up to __bss_stop.

Any brk-allocated data is reserved separately just before the kernel
pagetable is built, as that code allocates from unreserved spaces
in the e820 map, potentially allocating from any unused brk memory.
Ultimately any unused memory in the brk area is used in the general
kernel memory pool.

Initially the brk space is set to 1MB, which is probably much larger
than any user needs (the largest current user is i386 head_32.S's code
to build the pagetables to map the kernel, which can get fairly large
with a big kernel image and no PSE support).  So long as the system
has sufficient memory for the bootloader to reserve the kernel+1MB brk,
there are no bad effects resulting from an over-large brk.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/head32.c         |  2 +-
 arch/x86/kernel/head64.c         |  2 +-
 arch/x86/kernel/setup.c          | 39 ++++++++++++++++++++++++++++++++++-----
 arch/x86/kernel/vmlinux_32.lds.S |  7 +++++++
 arch/x86/kernel/vmlinux_64.lds.S |  5 +++++
 5 files changed, 48 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index ac108d1fe18..29f1095b084 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -18,7 +18,7 @@ void __init i386_start_kernel(void)
 {
 	reserve_trampoline_memory();
 
-	reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+	reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
 
 #ifdef CONFIG_BLK_DEV_INITRD
 	/* Reserve INITRD */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index f5b27224769..70eaa852c73 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -100,7 +100,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
 
 	reserve_trampoline_memory();
 
-	reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+	reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
 
 #ifdef CONFIG_BLK_DEV_INITRD
 	/* Reserve INITRD */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f28c56e6bf9..e6b742d2a3f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -114,6 +114,9 @@
 
 unsigned int boot_cpu_id __read_mostly;
 
+static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
+unsigned long _brk_end = (unsigned long)__brk_base;
+
 #ifdef CONFIG_X86_64
 int default_cpu_present_to_apicid(int mps_cpu)
 {
@@ -337,6 +340,34 @@ static void __init relocate_initrd(void)
 }
 #endif
 
+void * __init extend_brk(size_t size, size_t align)
+{
+	size_t mask = align - 1;
+	void *ret;
+
+	BUG_ON(_brk_start == 0);
+	BUG_ON(align & mask);
+
+	_brk_end = (_brk_end + mask) & ~mask;
+	BUG_ON((char *)(_brk_end + size) > __brk_limit);
+
+	ret = (void *)_brk_end;
+	_brk_end += size;
+
+	memset(ret, 0, size);
+
+	return ret;
+}
+
+static void __init reserve_brk(void)
+{
+	if (_brk_end > _brk_start)
+		reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
+
+	/* Mark brk area as locked down and no longer taking any new allocations */
+	_brk_start = 0;
+}
+
 static void __init reserve_initrd(void)
 {
 	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
@@ -717,11 +748,7 @@ void __init setup_arch(char **cmdline_p)
 	init_mm.start_code = (unsigned long) _text;
 	init_mm.end_code = (unsigned long) _etext;
 	init_mm.end_data = (unsigned long) _edata;
-#ifdef CONFIG_X86_32
-	init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
-#else
-	init_mm.brk = (unsigned long) &_end;
-#endif
+	init_mm.brk = _brk_end;
 
 	code_resource.start = virt_to_phys(_text);
 	code_resource.end = virt_to_phys(_etext)-1;
@@ -842,6 +869,8 @@ void __init setup_arch(char **cmdline_p)
 	setup_bios_corruption_check();
 #endif
 
+	reserve_brk();
+
 	/* max_pfn_mapped is updated here */
 	max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
 	max_pfn_mapped = max_low_pfn_mapped;
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 0d860963f26..27e44aa2158 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -189,7 +189,14 @@ SECTIONS
 	*(.bss)
 	. = ALIGN(4);
 	__bss_stop = .;
+
+	. = ALIGN(PAGE_SIZE);
+	__brk_base = . ;
+	. += 1024 * 1024 ;
+	__brk_limit = . ;
+
   	_end = . ;
+
 	/* This is where the kernel creates the early boot page tables */
 	. = ALIGN(PAGE_SIZE);
 	pg0 = . ;
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fe5d21ce724..ff373423138 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -247,6 +247,11 @@ SECTIONS
 	*(.bss.page_aligned)
 	*(.bss)
 	__bss_stop = .;
+
+	. = ALIGN(PAGE_SIZE);
+	__brk_base = . ;
+	. += 1024 * 1024 ;
+	__brk_limit = . ;
   }
 
   _end = . ;
-- 
cgit v1.2.3


From 5368a2be34b96fda9c05d79424fa63a73ead04ed Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sat, 14 Mar 2009 17:19:51 -0700
Subject: x86: move brk initialization out of #ifdef CONFIG_BLK_DEV_INITRD

Impact: build fix

The brk initialization functions were incorrectly located inside
an #ifdef CONFIG_VLK_DEV_INITRD block, causing the obvious build failure in
minimal configurations.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/kernel/setup.c | 57 +++++++++++++++++++++++++------------------------
 1 file changed, 29 insertions(+), 28 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e6b742d2a3f..b8329029c15 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -272,6 +272,35 @@ static inline void copy_edd(void)
 }
 #endif
 
+void * __init extend_brk(size_t size, size_t align)
+{
+	size_t mask = align - 1;
+	void *ret;
+
+	BUG_ON(_brk_start == 0);
+	BUG_ON(align & mask);
+
+	_brk_end = (_brk_end + mask) & ~mask;
+	BUG_ON((char *)(_brk_end + size) > __brk_limit);
+
+	ret = (void *)_brk_end;
+	_brk_end += size;
+
+	memset(ret, 0, size);
+
+	return ret;
+}
+
+static void __init reserve_brk(void)
+{
+	if (_brk_end > _brk_start)
+		reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
+
+	/* Mark brk area as locked down and no longer taking any
+	   new allocations */
+	_brk_start = 0;
+}
+
 #ifdef CONFIG_BLK_DEV_INITRD
 
 #ifdef CONFIG_X86_32
@@ -340,34 +369,6 @@ static void __init relocate_initrd(void)
 }
 #endif
 
-void * __init extend_brk(size_t size, size_t align)
-{
-	size_t mask = align - 1;
-	void *ret;
-
-	BUG_ON(_brk_start == 0);
-	BUG_ON(align & mask);
-
-	_brk_end = (_brk_end + mask) & ~mask;
-	BUG_ON((char *)(_brk_end + size) > __brk_limit);
-
-	ret = (void *)_brk_end;
-	_brk_end += size;
-
-	memset(ret, 0, size);
-
-	return ret;
-}
-
-static void __init reserve_brk(void)
-{
-	if (_brk_end > _brk_start)
-		reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
-
-	/* Mark brk area as locked down and no longer taking any new allocations */
-	_brk_start = 0;
-}
-
 static void __init reserve_initrd(void)
 {
 	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
-- 
cgit v1.2.3


From ccf3fe02e35f4abca2589f99022cc25084bbd8ae Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 27 Feb 2009 13:27:38 -0800
Subject: x86-32: use brk segment for allocating initial kernel pagetable

Impact: use new interface instead of previous ad hoc implementation

Rather than having special purpose init_pg_table_start/end variables
to delimit the kernel pagetable built by head_32.S, just use the brk
mechanism to extend the bss for the new pagetable.

This patch removes init_pg_table_start/end and pg0, defines __brk_base
(which is page-aligned and immediately follows _end), initializes
the brk region to start there, and uses it for the 32-bit pagetable.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/head32.c         |  3 ---
 arch/x86/kernel/head_32.S        | 14 +++++++-------
 arch/x86/kernel/setup.c          |  6 ------
 arch/x86/kernel/vmlinux_32.lds.S |  4 ----
 4 files changed, 7 insertions(+), 20 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 29f1095b084..3f8579f8d42 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -29,9 +29,6 @@ void __init i386_start_kernel(void)
 		reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
 	}
 #endif
-	reserve_early(init_pg_tables_start, init_pg_tables_end,
-			"INIT_PG_TABLE");
-
 	reserve_ebda_region();
 
 	/*
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index c32ca19d591..db6652710e9 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -167,7 +167,7 @@ num_subarch_entries = (. - subarch_entries) / 4
 /*
  * Initialize page tables.  This creates a PDE and a set of page
  * tables, which are located immediately beyond _end.  The variable
- * init_pg_tables_end is set up to point to the first "safe" location.
+ * _brk_end is set up to point to the first "safe" location.
  * Mappings are created both at virtual address 0 (identity mapping)
  * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
  *
@@ -190,8 +190,7 @@ default_entry:
 
 	xorl %ebx,%ebx				/* %ebx is kept at zero */
 
-	movl $pa(pg0), %edi
-	movl %edi, pa(init_pg_tables_start)
+	movl $pa(__brk_base), %edi
 	movl $pa(swapper_pg_pmd), %edx
 	movl $PTE_IDENT_ATTR, %eax
 10:
@@ -216,7 +215,8 @@ default_entry:
 	cmpl %ebp,%eax
 	jb 10b
 1:
-	movl %edi,pa(init_pg_tables_end)
+	addl $__PAGE_OFFSET, %edi
+	movl %edi, pa(_brk_end)
 	shrl $12, %eax
 	movl %eax, pa(max_pfn_mapped)
 
@@ -227,8 +227,7 @@ default_entry:
 
 page_pde_offset = (__PAGE_OFFSET >> 20);
 
-	movl $pa(pg0), %edi
-	movl %edi, pa(init_pg_tables_start)
+	movl $pa(__brk_base), %edi
 	movl $pa(swapper_pg_dir), %edx
 	movl $PTE_IDENT_ATTR, %eax
 10:
@@ -249,7 +248,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
 	cmpl %ebp,%eax
 	jb 10b
-	movl %edi,pa(init_pg_tables_end)
+	addl $__PAGE_OFFSET, %edi
+	movl %edi, pa(_brk_end)
 	shrl $12, %eax
 	movl %eax, pa(max_pfn_mapped)
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b8329029c15..3ac2aa7b9ea 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -161,12 +161,6 @@ static struct resource bss_resource = {
 
 
 #ifdef CONFIG_X86_32
-/* This value is set up by the early boot code to point to the value
-   immediately after the boot time page tables.  It contains a *physical*
-   address, and must not be in the .bss segment! */
-unsigned long init_pg_tables_start __initdata = ~0UL;
-unsigned long init_pg_tables_end __initdata = ~0UL;
-
 static struct resource video_ram_resource = {
 	.name	= "Video RAM area",
 	.start	= 0xa0000,
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 27e44aa2158..1063fbe93c7 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -196,10 +196,6 @@ SECTIONS
 	__brk_limit = . ;
 
   	_end = . ;
-
-	/* This is where the kernel creates the early boot page tables */
-	. = ALIGN(PAGE_SIZE);
-	pg0 = . ;
   }
 
   /* Sections to be discarded */
-- 
cgit v1.2.3


From 6de6cb442e76bbaf2e685150be8ddac0f237a59c Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 27 Feb 2009 13:35:45 -0800
Subject: x86: use brk allocation for DMI

Impact: use new interface instead of previous ad hoc implementation

Use extend_brk() to allocate memory for DMI rather than having an
ad-hoc allocator.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/setup.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3ac2aa7b9ea..e894f36335f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -215,12 +215,6 @@ unsigned long mmu_cr4_features = X86_CR4_PAE;
 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
 int bootloader_type;
 
-/*
- * Early DMI memory
- */
-int dmi_alloc_index;
-char dmi_alloc_data[DMI_MAX_DATA];
-
 /*
  * Setup options
  */
-- 
cgit v1.2.3


From 7543c1de84ed93c6769c9f20dced08a522af8912 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 12 Mar 2009 16:04:42 -0700
Subject: x86-32: compute initial mapping size more accurately

Impact: simplification

We only need to map the kernel in head_32.S, not the whole of
lowmem.  We use 512MB as a reasonable (but arbitrary) limit on
the maximum size of the kernel image.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/head_32.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index db6652710e9..3ce5456dfbe 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -54,7 +54,7 @@
  *
  * This should be a multiple of a page.
  */
-LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
+LOW_PAGES = (KERNEL_IMAGE_SIZE + PAGE_SIZE_asm - 1)>>PAGE_SHIFT
 
 /*
  * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate
-- 
cgit v1.2.3


From 796216a57fe45c04adc35bda1f0782efec78a713 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Thu, 12 Mar 2009 16:09:49 -0700
Subject: x86: allow extend_brk users to reserve brk space

Impact: new interface; remove hard-coded limit

Add RESERVE_BRK(name, size) macro to reserve space in the brk
area.  This should be a conservative (ie, larger) estimate of
how much space might possibly be required from the brk area.
Any unused space will be freed, so there's no real downside
on making the reservation too large (within limits).

The name should be unique within a given file, and somewhat
descriptive.

The C definition of RESERVE_BRK() ends up being more complex than
one would expect to work around a cluster of gcc infelicities:

  The first attempt was to simply try putting __section(.brk_reservation)
  on a variable.  This doesn't work because it ends up making it a
  @progbits section, which gets actual space allocated in the vmlinux
  executable.

  The second attempt was to emit the space into a section using asm,
  but gcc doesn't allow arguments to be passed to file-level asm()
  statements, making it hard to pass in the size.

  The final attempt is to wrap the asm() in a function to allow
  it to have arguments, and put the function itself into the
  .discard section, which vmlinux*.lds drops entirely from the
  emitted vmlinux.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/head_32.S        | 2 ++
 arch/x86/kernel/setup.c          | 2 ++
 arch/x86/kernel/vmlinux_32.lds.S | 4 +++-
 arch/x86/kernel/vmlinux_64.lds.S | 4 +++-
 4 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 3ce5456dfbe..9e89f2a14b9 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -75,6 +75,8 @@ ALLOCATOR_SLOP = 4
 
 INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
 
+RESERVE_BRK(pagetables, PAGE_TABLE_SIZE * PAGE_SIZE)
+
 /*
  * 32-bit kernel entrypoint; only used by the boot CPU.  On entry,
  * %esi points to the real-mode code as a 32-bit pointer.
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e894f36335f..a0d26237d7c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,6 +112,8 @@
 #define ARCH_SETUP
 #endif
 
+RESERVE_BRK(dmi_alloc, 65536);
+
 unsigned int boot_cpu_id __read_mostly;
 
 static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 1063fbe93c7..a1f28b85fb3 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -192,7 +192,8 @@ SECTIONS
 
 	. = ALIGN(PAGE_SIZE);
 	__brk_base = . ;
-	. += 1024 * 1024 ;
+	. += 64 * 1024 ;	/* 64k slop space */
+	*(.brk_reservation)	/* areas brk users have reserved */
 	__brk_limit = . ;
 
   	_end = . ;
@@ -201,6 +202,7 @@ SECTIONS
   /* Sections to be discarded */
   /DISCARD/ : {
 	*(.exitcall.exit)
+	*(.discard)
 	}
 
   STABS_DEBUG
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index ff373423138..7996687663a 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -250,7 +250,8 @@ SECTIONS
 
 	. = ALIGN(PAGE_SIZE);
 	__brk_base = . ;
-	. += 1024 * 1024 ;
+	. += 64 * 1024;		/* 64k slop space */
+	*(.brk_reservation)	/* areas brk users have reserved */
 	__brk_limit = . ;
   }
 
@@ -260,6 +261,7 @@ SECTIONS
   /DISCARD/ : {
 	*(.exitcall.exit)
 	*(.eh_frame)
+	*(.discard)
 	}
 
   STABS_DEBUG
-- 
cgit v1.2.3


From 2bd2753ff46346543ab92e80df9d96366e21baa5 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 9 Mar 2009 01:15:57 -0700
Subject: x86: put initial_pg_tables into .bss

Impact: makes vmlinux section information more useful

Don't use ram after _end blindly for pagetables. aka init pages is before _end
put those pg table into .bss

[Adapted to use brk segment - Jeremy]

v2: keep initial page table up to 512M only.
v4: put initial page tables just before _end

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/head_32.S        | 43 +++++++++++++---------------------------
 arch/x86/kernel/vmlinux_32.lds.S |  6 ++++++
 2 files changed, 20 insertions(+), 29 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 9e89f2a14b9..c79741cfb07 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -41,41 +41,28 @@
  * This is how much memory *in addition to the memory covered up to
  * and including _end* we need mapped initially.
  * We need:
- *  - one bit for each possible page, but only in low memory, which means
- *     2^32/4096/8 = 128K worst case (4G/4G split.)
- *  - enough space to map all low memory, which means
- *     (2^32/4096) / 1024 pages (worst case, non PAE)
- *     (2^32/4096) / 512 + 4 pages (worst case for PAE)
- *  - a few pages for allocator use before the kernel pagetable has
- *     been set up
+ *     (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
+ *     (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
  *
  * Modulo rounding, each megabyte assigned here requires a kilobyte of
  * memory, which is currently unreclaimed.
  *
  * This should be a multiple of a page.
+ *
+ * KERNEL_IMAGE_SIZE should be greater than pa(_end)
+ * and small than max_low_pfn, otherwise will waste some page table entries
  */
 LOW_PAGES = (KERNEL_IMAGE_SIZE + PAGE_SIZE_asm - 1)>>PAGE_SHIFT
 
-/*
- * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate
- * pagetables from above the 16MB DMA limit, so we'll have to set
- * up pagetables 16MB more (worst-case):
- */
-#ifdef CONFIG_DEBUG_PAGEALLOC
-LOW_PAGES = LOW_PAGES + 0x1000000
-#endif
-
 #if PTRS_PER_PMD > 1
 PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
 #else
 PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
 #endif
-BOOTBITMAP_SIZE = LOW_PAGES / 8
 ALLOCATOR_SLOP = 4
 
-INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
-
-RESERVE_BRK(pagetables, PAGE_TABLE_SIZE * PAGE_SIZE)
+INIT_MAP_SIZE = (PAGE_TABLE_SIZE + ALLOCATOR_SLOP) * PAGE_SIZE_asm
+RESERVE_BRK(pagetables, INIT_MAP_SIZE)
 
 /*
  * 32-bit kernel entrypoint; only used by the boot CPU.  On entry,
@@ -168,10 +155,10 @@ num_subarch_entries = (. - subarch_entries) / 4
 
 /*
  * Initialize page tables.  This creates a PDE and a set of page
- * tables, which are located immediately beyond _end.  The variable
+ * tables, which are located immediately beyond __brk_base.  The variable
  * _brk_end is set up to point to the first "safe" location.
  * Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
+ * and PAGE_OFFSET for up to _end.
  *
  * Note that the stack is not yet set up!
  */
@@ -210,10 +197,9 @@ default_entry:
 	loop 11b
 
 	/*
-	 * End condition: we must map up to and including INIT_MAP_BEYOND_END
-	 * bytes beyond the end of our own page tables.
+	 * End condition: we must map up to the end.
 	 */
-	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+	movl $pa(_end) + PTE_IDENT_ATTR, %ebp
 	cmpl %ebp,%eax
 	jb 10b
 1:
@@ -243,11 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 	addl $0x1000,%eax
 	loop 11b
 	/*
-	 * End condition: we must map up to and including INIT_MAP_BEYOND_END
-	 * bytes beyond the end of our own page tables; the +0x007 is
-	 * the attribute bits
+	 * End condition: we must map up to end
 	 */
-	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+	movl $pa(_end) + PTE_IDENT_ATTR, %ebp
 	cmpl %ebp,%eax
 	jb 10b
 	addl $__PAGE_OFFSET, %edi
@@ -638,6 +622,7 @@ swapper_pg_fixmap:
 	.fill 1024,4,0
 ENTRY(empty_zero_page)
 	.fill 4096,1,0
+
 /*
  * This starts the data section.
  */
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index a1f28b85fb3..98424f33e07 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -210,6 +210,12 @@ SECTIONS
   DWARF_DEBUG
 }
 
+/*
+ * Build-time check on the image size:
+ */
+ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+	"kernel image bigger than KERNEL_IMAGE_SIZE")
+
 #ifdef CONFIG_KEXEC
 /* Link time checks */
 #include <asm/kexec.h>
-- 
cgit v1.2.3


From 6d7942dc2a70a7e74c352107b150265602671588 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 14 Mar 2009 14:32:41 -0700
Subject: x86: fix 64k corruption-check

Impact: fix boot crash

Need to exit early if the addr is far above 64k.

The crash got exposed by:

  78a8b35: x86: make e820_update_range() handle small range update

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: <stable@kernel.org>
LKML-Reference: <49BC2279.2030101@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/check.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index b617b1164f1..fc999e6fc46 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -86,12 +86,12 @@ void __init setup_bios_corruption_check(void)
 		if (!(addr + 1))
 			break;
 
+		if (addr >= corruption_check_size)
+			break;
+
 		if ((addr + size) > corruption_check_size)
 			size = corruption_check_size - addr;
 
-		if (size == 0)
-			break;
-
 		e820_update_range(addr, size, E820_RAM, E820_RESERVED);
 		scan_areas[num_scan_areas].addr = addr;
 		scan_areas[num_scan_areas].size = size;
-- 
cgit v1.2.3


From c61cf4cfe7c73c7aa62dde3ff82cd475b9c41481 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sun, 15 Mar 2009 00:59:19 -0700
Subject: x86: print out more info in e820_update_range()

Impact: help debug e820 bugs

Try to print out more info, to catch wrong call parameters.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49BCB557.3030000@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 56 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 35 insertions(+), 21 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 0c34ff49ff4..fb638d9ce6d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -131,6 +131,31 @@ void __init e820_add_region(u64 start, u64 size, int type)
 	__e820_add_region(&e820, start, size, type);
 }
 
+static void __init e820_print_type(u32 type)
+{
+	switch (type) {
+	case E820_RAM:
+	case E820_RESERVED_KERN:
+		printk(KERN_CONT "(usable)");
+		break;
+	case E820_RESERVED:
+		printk(KERN_CONT "(reserved)");
+		break;
+	case E820_ACPI:
+		printk(KERN_CONT "(ACPI data)");
+		break;
+	case E820_NVS:
+		printk(KERN_CONT "(ACPI NVS)");
+		break;
+	case E820_UNUSABLE:
+		printk(KERN_CONT "(unusable)");
+		break;
+	default:
+		printk(KERN_CONT "type %u", type);
+		break;
+	}
+}
+
 void __init e820_print_map(char *who)
 {
 	int i;
@@ -140,27 +165,8 @@ void __init e820_print_map(char *who)
 		       (unsigned long long) e820.map[i].addr,
 		       (unsigned long long)
 		       (e820.map[i].addr + e820.map[i].size));
-		switch (e820.map[i].type) {
-		case E820_RAM:
-		case E820_RESERVED_KERN:
-			printk(KERN_CONT "(usable)\n");
-			break;
-		case E820_RESERVED:
-			printk(KERN_CONT "(reserved)\n");
-			break;
-		case E820_ACPI:
-			printk(KERN_CONT "(ACPI data)\n");
-			break;
-		case E820_NVS:
-			printk(KERN_CONT "(ACPI NVS)\n");
-			break;
-		case E820_UNUSABLE:
-			printk("(unusable)\n");
-			break;
-		default:
-			printk(KERN_CONT "type %u\n", e820.map[i].type);
-			break;
-		}
+		e820_print_type(e820.map[i].type);
+		printk(KERN_CONT "\n");
 	}
 }
 
@@ -437,6 +443,14 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
 		size = ULLONG_MAX - start;
 
 	end = start + size;
+	printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
+		       (unsigned long long) start,
+		       (unsigned long long) end);
+	e820_print_type(old_type);
+	printk(KERN_CONT " ==> ");
+	e820_print_type(new_type);
+	printk(KERN_CONT "\n");
+
 	for (i = 0; i < e820x->nr_map; i++) {
 		struct e820entry *ei = &e820x->map[i];
 		u64 final_start, final_end;
-- 
cgit v1.2.3


From 514ec49a5f5146a6c0ade1a688787acf13617868 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Mon, 16 Mar 2009 17:07:33 +0900
Subject: x86, mce: remove incorrect __cpuinit for intel_init_cmci()

Impact: Bug fix on UP

Referring commit cc3ca22063784076bd240fda87217387a8f2ae92,
Peter removed __cpuinit annotations for mce_cpu_features()
and its successor functions, which caused troubles on UP
configurations.

However the intel_init_cmci() was introduced after that and
it also has __cpuinit annotation even though it is called from
mce_cpu_features(). Remove the annotation from that function
too.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index aaa7d973093..57df3d38347 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -270,7 +270,7 @@ void cmci_reenable(void)
 		cmci_discover(banks, 0);
 }
 
-static __cpuinit void intel_init_cmci(void)
+static void intel_init_cmci(void)
 {
 	int banks;
 
-- 
cgit v1.2.3


From 250981e6e1ef04947eccaa55e8c25ddcaa47a02e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 16 Mar 2009 13:07:21 +0100
Subject: x86: reduce preemption off section in exit thread

Impact: latency improvement

No need to keep preemption disabled over the kfree call.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/process.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6afa5232dbb..156f87582c6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -65,11 +65,11 @@ void exit_thread(void)
 {
 	struct task_struct *me = current;
 	struct thread_struct *t = &me->thread;
+	unsigned long *bp = t->io_bitmap_ptr;
 
-	if (me->thread.io_bitmap_ptr) {
+	if (bp) {
 		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
 
-		kfree(t->io_bitmap_ptr);
 		t->io_bitmap_ptr = NULL;
 		clear_thread_flag(TIF_IO_BITMAP);
 		/*
@@ -78,6 +78,7 @@ void exit_thread(void)
 		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
 		t->io_bitmap_max = 0;
 		put_cpu();
+		kfree(bp);
 	}
 
 	ds_exit_thread(current);
-- 
cgit v1.2.3


From f0348c438c9ea156194d31fadde4a9e75522196f Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 16 Mar 2009 16:33:59 -0700
Subject: x86: MTRR workaround for system with stange var MTRRs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: don't trim e820 according to wrong mtrr

Ozan reports that his server emits strange warning.
it turns out the BIOS sets the MTRRs incorrectly.

Ignore those strange ranges, and don't trim e820,
just emit one warning about BIOS

Reported-by: Ozan Çağlayan <ozan@pardus.org.tr>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49BEE1E7.7020706@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mtrr/cleanup.c | 11 +++++++++++
 arch/x86/kernel/cpu/mtrr/main.c    | 16 ++++++++--------
 arch/x86/kernel/cpu/mtrr/mtrr.h    |  1 +
 3 files changed, 20 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 58b58bbf7eb..f4f89fbc228 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -189,6 +189,17 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
 		if (!size)
 			continue;
 		base = range_state[i].base_pfn;
+		if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
+		    (mtrr_state.enabled & 1)) {
+			/* Var MTRR contains UC entry below 1M? Skip it: */
+			printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d "
+				"contains strange UC entry under 1M, check "
+				"with your system vendor!\n", i);
+			if (base + size <= (1<<(20-PAGE_SHIFT)))
+				continue;
+			size -= (1<<(20-PAGE_SHIFT)) - base;
+			base = 1<<(20-PAGE_SHIFT);
+		}
 		subtract_range(range, base, base + size - 1);
 	}
 	if (extra_remove_size)
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 5c2e266f41d..03cda01f57c 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -574,7 +574,7 @@ struct mtrr_value {
 	unsigned long	lsize;
 };
 
-static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES];
+static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
 
 static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
 {
@@ -582,9 +582,9 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
 
 	for (i = 0; i < num_var_ranges; i++) {
 		mtrr_if->get(i,
-			     &mtrr_state[i].lbase,
-			     &mtrr_state[i].lsize,
-			     &mtrr_state[i].ltype);
+			     &mtrr_value[i].lbase,
+			     &mtrr_value[i].lsize,
+			     &mtrr_value[i].ltype);
 	}
 	return 0;
 }
@@ -594,11 +594,11 @@ static int mtrr_restore(struct sys_device * sysdev)
 	int i;
 
 	for (i = 0; i < num_var_ranges; i++) {
-		if (mtrr_state[i].lsize) 
+		if (mtrr_value[i].lsize)
 			set_mtrr(i,
-				 mtrr_state[i].lbase,
-				 mtrr_state[i].lsize,
-				 mtrr_state[i].ltype);
+				 mtrr_value[i].lbase,
+				 mtrr_value[i].lsize,
+				 mtrr_value[i].ltype);
 	}
 	return 0;
 }
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 6710e93021a..77f67f7b347 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -79,6 +79,7 @@ extern struct mtrr_ops * mtrr_if;
 
 extern unsigned int num_var_ranges;
 extern u64 mtrr_tom2;
+extern struct mtrr_state_type mtrr_state;
 
 void mtrr_state_warn(void);
 const char *mtrr_attrib_to_str(int x);
-- 
cgit v1.2.3


From c090f532db3ab5b7be503f8ac84b0d33a18646c6 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Mon, 16 Mar 2009 12:07:54 -0700
Subject: x86-32: make sure we map enough to fit linear map pagetables

Impact: crash fix

head_32.S needs to map the kernel itself, and enough space so
that mm/init.c can allocate space from the e820 allocator
for the linear map of low memory.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/head_32.S | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index c79741cfb07..d383a7c0e49 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -38,8 +38,8 @@
 #define X86_VENDOR_ID	new_cpu_data+CPUINFO_x86_vendor_id
 
 /*
- * This is how much memory *in addition to the memory covered up to
- * and including _end* we need mapped initially.
+ * This is how much memory in addition to the memory covered up to
+ * and including _end we need mapped initially.
  * We need:
  *     (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
  *     (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
@@ -52,16 +52,25 @@
  * KERNEL_IMAGE_SIZE should be greater than pa(_end)
  * and small than max_low_pfn, otherwise will waste some page table entries
  */
-LOW_PAGES = (KERNEL_IMAGE_SIZE + PAGE_SIZE_asm - 1)>>PAGE_SHIFT
 
 #if PTRS_PER_PMD > 1
-PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
+#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
 #else
-PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
+#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
 #endif
 ALLOCATOR_SLOP = 4
 
-INIT_MAP_SIZE = (PAGE_TABLE_SIZE + ALLOCATOR_SLOP) * PAGE_SIZE_asm
+/* Enough space to fit pagetables for the low memory linear map */
+MAPPING_BEYOND_END = (PAGE_TABLE_SIZE(1 << (32 - PAGE_SHIFT)) * PAGE_SIZE)
+
+/*
+ * Worst-case size of the kernel mapping we need to make:
+ * the worst-case size of the kernel itself, plus the extra we need
+ * to map for the linear map.
+ */
+KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT
+
+INIT_MAP_SIZE = (PAGE_TABLE_SIZE(KERNEL_PAGES) + ALLOCATOR_SLOP) * PAGE_SIZE_asm
 RESERVE_BRK(pagetables, INIT_MAP_SIZE)
 
 /*
@@ -197,9 +206,9 @@ default_entry:
 	loop 11b
 
 	/*
-	 * End condition: we must map up to the end.
+	 * End condition: we must map up to the end + MAPPING_BEYOND_END.
 	 */
-	movl $pa(_end) + PTE_IDENT_ATTR, %ebp
+	movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
 	cmpl %ebp,%eax
 	jb 10b
 1:
@@ -229,9 +238,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 	addl $0x1000,%eax
 	loop 11b
 	/*
-	 * End condition: we must map up to end
+	 * End condition: we must map up to the end + MAPPING_BEYOND_END.
 	 */
-	movl $pa(_end) + PTE_IDENT_ATTR, %ebp
+	movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
 	cmpl %ebp,%eax
 	jb 10b
 	addl $__PAGE_OFFSET, %edi
-- 
cgit v1.2.3


From b8a22a6273d5aed248db2695ce9bf65eb3e9fbe6 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Mon, 16 Mar 2009 12:10:07 -0700
Subject: x86-32: remove ALLOCATOR_SLOP from head_32.S

Impact: cleanup

ALLOCATOR_SLOP is a vestigial remain from when we used the
bootmem allocator to allocate the kernel's linear memory mapping.
Now we directly reserve pages from the e820 mapping, and no
longer require secondary structures to keep track of allocated
pages.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/head_32.S | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index d383a7c0e49..fc884b90b6d 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -58,7 +58,6 @@
 #else
 #define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
 #endif
-ALLOCATOR_SLOP = 4
 
 /* Enough space to fit pagetables for the low memory linear map */
 MAPPING_BEYOND_END = (PAGE_TABLE_SIZE(1 << (32 - PAGE_SHIFT)) * PAGE_SIZE)
@@ -70,7 +69,7 @@ MAPPING_BEYOND_END = (PAGE_TABLE_SIZE(1 << (32 - PAGE_SHIFT)) * PAGE_SIZE)
  */
 KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT
 
-INIT_MAP_SIZE = (PAGE_TABLE_SIZE(KERNEL_PAGES) + ALLOCATOR_SLOP) * PAGE_SIZE_asm
+INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm
 RESERVE_BRK(pagetables, INIT_MAP_SIZE)
 
 /*
-- 
cgit v1.2.3


From 60ac98213914cc615c67e84bfb964aa95a080d13 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Tue, 17 Mar 2009 11:38:23 -0700
Subject: x86-32: tighten the bound on additional memory to map

Impact: Tighten bound to avoid masking errors

The definition of MAPPING_BEYOND_END was excessive; this has a nasty
tendency to mask bugs.  We have learned over time that this kind of
bug hiding can cause some very strange errors.  Therefore, tighten the
bound to only need to map the actual kernel area.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Yinghai Lu <yinghai@kernel.org>
---
 arch/x86/kernel/head_32.S | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index fc884b90b6d..30683883e0c 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -60,7 +60,8 @@
 #endif
 
 /* Enough space to fit pagetables for the low memory linear map */
-MAPPING_BEYOND_END = (PAGE_TABLE_SIZE(1 << (32 - PAGE_SHIFT)) * PAGE_SIZE)
+MAPPING_BEYOND_END = \
+	PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
 
 /*
  * Worst-case size of the kernel mapping we need to make:
-- 
cgit v1.2.3


From 704439ddf9f8ff1fc8c6d8abaac4bc4c95697e39 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Sat, 14 Mar 2009 23:20:47 -0700
Subject: x86/brk: put the brk reservations in their own section

Impact: disambiguate real .bss variables from .brk storage

Add a .brk section after the .bss section.  This has no effect
on the final vmlinux, but it more clearly distinguishes the space
taken by actual .bss symbols, and the variable space reserved
by .brk users.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/kernel/vmlinux_32.lds.S | 8 +++++---
 arch/x86/kernel/vmlinux_64.lds.S | 4 +++-
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 98424f33e07..de14973e47f 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -189,16 +189,18 @@ SECTIONS
 	*(.bss)
 	. = ALIGN(4);
 	__bss_stop = .;
+  }
 
+  .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
 	. = ALIGN(PAGE_SIZE);
 	__brk_base = . ;
-	. += 64 * 1024 ;	/* 64k slop space */
+ 	. += 64 * 1024 ;	/* 64k alignment slop space */
 	*(.brk_reservation)	/* areas brk users have reserved */
 	__brk_limit = . ;
-
-  	_end = . ;
   }
 
+  _end = . ;
+
   /* Sections to be discarded */
   /DISCARD/ : {
 	*(.exitcall.exit)
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 7996687663a..c8742507b03 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -247,10 +247,12 @@ SECTIONS
 	*(.bss.page_aligned)
 	*(.bss)
 	__bss_stop = .;
+  }
 
+  .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
 	. = ALIGN(PAGE_SIZE);
 	__brk_base = . ;
-	. += 64 * 1024;		/* 64k slop space */
+ 	. += 64 * 1024 ;	/* 64k alignment slop space */
 	*(.brk_reservation)	/* areas brk users have reserved */
 	__brk_limit = . ;
   }
-- 
cgit v1.2.3


From 0a699af8e613a670be50245366fa18cb19ac5172 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Tue, 17 Mar 2009 14:14:31 -0700
Subject: x86-32: move _end to a dummy section

Impact: build fix with CONFIG_RELOCATABLE

Move _end into a dummy section, so that relocs.c will know it is a
relocatable symbol.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
---
 arch/x86/kernel/vmlinux_32.lds.S | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index de14973e47f..62ad500d55f 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -199,7 +199,9 @@ SECTIONS
 	__brk_limit = . ;
   }
 
-  _end = . ;
+  .end : AT(ADDR(.end) - LOAD_OFFSET) {
+	_end = . ;
+  }
 
   /* Sections to be discarded */
   /DISCARD/ : {
-- 
cgit v1.2.3


From 9d783ba042771284fb4ee5013c3d94220755ae7f Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 16 Mar 2009 17:04:55 -0700
Subject: x86, x2apic: enable fault handling for intr-remapping

Impact: interface augmentation (not yet used)

Enable fault handling flow for intr-remapping aswell. Fault handling
code now shared by both dma-remapping and intr-remapping.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/apic/io_apic.c  | 9 +++++++--
 arch/x86/kernel/apic/probe_64.c | 9 +++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 00e6071cefc..b18a7734d68 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3294,7 +3294,12 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 	} else
 #endif
 	{
-		msg->address_hi = MSI_ADDR_BASE_HI;
+		if (x2apic_enabled())
+			msg->address_hi = MSI_ADDR_BASE_HI |
+					  MSI_ADDR_EXT_DEST_ID(dest);
+		else
+			msg->address_hi = MSI_ADDR_BASE_HI;
+
 		msg->address_lo =
 			MSI_ADDR_BASE_LO |
 			((apic->irq_dest_mode == 0) ?
@@ -3528,7 +3533,7 @@ void arch_teardown_msi_irq(unsigned int irq)
 	destroy_irq(irq);
 }
 
-#ifdef CONFIG_DMAR
+#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
 #ifdef CONFIG_SMP
 static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 8d7748efe6a..8297c2b8ed2 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -68,6 +68,15 @@ void __init default_setup_apic_routing(void)
 			apic = &apic_physflat;
 		printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
 	}
+
+#ifdef CONFIG_X86_X2APIC
+	/*
+	 * Now that apic routing model is selected, configure the
+	 * fault handling for intr remapping.
+	 */
+	if (intr_remapping_enabled)
+		enable_drhd_fault_handling();
+#endif
 }
 
 /* Same for both flat and physical. */
-- 
cgit v1.2.3


From 7c6d9f9785d156d0438401ef270322da3d5247db Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 16 Mar 2009 17:04:59 -0700
Subject: x86, x2apic: use virtual wire A mode in disable_IO_APIC() with
 interrupt-remapping

Impact: make kexec work with x2apic

disable_IO_APIC() gets called during crashdump aswell, which configures the
IO-APIC/LAPIC so that legacy interrupts can be delivered for the kexec'd kernel.

In the presence of interrupt-remapping, we need to change the
interrupt-remapping configuration aswell as modifying IO-APIC for virtual wire
B mode.

To keep things simple during the crash, use virtual wire A mode
(for which we don't need to touch io-apic and interrupt-remapping tables).

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/apic/io_apic.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index b18a7734d68..4d975d0e358 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2040,8 +2040,13 @@ void disable_IO_APIC(void)
 	 * If the i8259 is routed through an IOAPIC
 	 * Put that IOAPIC in virtual wire mode
 	 * so legacy interrupts can be delivered.
+	 *
+	 * With interrupt-remapping, for now we will use virtual wire A mode,
+	 * as virtual wire B is little complex (need to configure both
+	 * IOAPIC RTE aswell as interrupt-remapping table entry).
+	 * As this gets called during crash dump, keep this simple for now.
 	 */
-	if (ioapic_i8259.pin != -1) {
+	if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
 		struct IO_APIC_route_entry entry;
 
 		memset(&entry, 0, sizeof(entry));
@@ -2061,7 +2066,10 @@ void disable_IO_APIC(void)
 		ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
 	}
 
-	disconnect_bsp_APIC(ioapic_i8259.pin != -1);
+	/*
+	 * Use virtual wire A mode when interrupt remapping is enabled.
+	 */
+	disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1);
 }
 
 #ifdef CONFIG_X86_32
-- 
cgit v1.2.3


From cf6567fe40c55e9cffca7355cd34e50fb2871e4e Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 16 Mar 2009 17:05:00 -0700
Subject: x86, x2apic: fix clear_local_APIC() in the presence of x2apic

Impact: cleanup, paranoia

We were not clearing the local APIC in clear_local_APIC() in the
presence of x2apic. Fix it.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/apic/apic.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 30909a258d0..699f8cf76bb 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -809,7 +809,7 @@ void clear_local_APIC(void)
 	u32 v;
 
 	/* APIC hasn't been mapped yet */
-	if (!apic_phys)
+	if (!x2apic && !apic_phys)
 		return;
 
 	maxlvt = lapic_get_maxlvt();
@@ -1523,12 +1523,10 @@ void __init early_init_lapic_mapping(void)
  */
 void __init init_apic_mappings(void)
 {
-#ifdef CONFIG_X86_X2APIC
 	if (x2apic) {
 		boot_cpu_physical_apicid = read_apic_id();
 		return;
 	}
-#endif
 
 	/*
 	 * If no local APIC can be found then set up a fake all
@@ -1972,12 +1970,9 @@ static int lapic_resume(struct sys_device *dev)
 
 	local_irq_save(flags);
 
-#ifdef CONFIG_X86_X2APIC
 	if (x2apic)
 		enable_x2apic();
-	else
-#endif
-	{
+	else {
 		/*
 		 * Make sure the APICBASE points to the right address
 		 *
-- 
cgit v1.2.3


From 0280f7c416c652a2fd95d166f52b199ae61122c0 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 16 Mar 2009 17:05:01 -0700
Subject: x86, x2apic: cleanup the IO-APIC level migration with
 interrupt-remapping

Impact: simplification

In the current code, for level triggered migration, we need to modify the
io-apic RTE with the update vector information, along with modifying interrupt
remapping table entry(IRTE) with vector and destination. This is to ensure that
remote IRR bit inthe IOAPIC RTE gets cleared when the cpu does EOI.

With this patch, for level triggered, we eliminate the io-apic RTE modification
(with the updated vector information), by using a virtual vector (io-apic pin
number).  Real vector that is used for interrupting cpu will be coming from
the interrupt-remapping table entry. Trigger mode in the IRTE will always be
edge, and the actual level or edge trigger will be setup in the IO-APIC RTE.
So a level triggered interrupt will appear as an edge to the local apic
cpu but still as level to the IO-APIC.

With this change, level irq migration can be done by simply modifying
the interrupt-remapping table entry with out changing the io-apic RTE.
And as the interrupt appears as edge at the cpu, in addition to do the
local apic EOI, we need to do IO-APIC directed EOI to clear the remote
IRR bit in  the IO-APIC RTE.

This simplies the irq migration in the presence of interrupt-remapping.

Idea-by: Rajesh Sankaran <rajesh.sankaran@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/apic/io_apic.c | 156 +++++++++++++++++------------------------
 1 file changed, 65 insertions(+), 91 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 4d975d0e358..e074eac5bd3 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -389,6 +389,8 @@ struct io_apic {
 	unsigned int index;
 	unsigned int unused[3];
 	unsigned int data;
+	unsigned int unused2[11];
+	unsigned int eoi;
 };
 
 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
@@ -397,6 +399,12 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
 		+ (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
 }
 
+static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
+{
+	struct io_apic __iomem *io_apic = io_apic_base(apic);
+	writel(vector, &io_apic->eoi);
+}
+
 static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
@@ -1478,7 +1486,7 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
 int setup_ioapic_entry(int apic_id, int irq,
 		       struct IO_APIC_route_entry *entry,
 		       unsigned int destination, int trigger,
-		       int polarity, int vector)
+		       int polarity, int vector, int pin)
 {
 	/*
 	 * add it to the IO-APIC irq-routing table:
@@ -1504,7 +1512,14 @@ int setup_ioapic_entry(int apic_id, int irq,
 
 		irte.present = 1;
 		irte.dst_mode = apic->irq_dest_mode;
-		irte.trigger_mode = trigger;
+		/*
+		 * Trigger mode in the IRTE will always be edge, and the
+		 * actual level or edge trigger will be setup in the IO-APIC
+		 * RTE. This will help simplify level triggered irq migration.
+		 * For more details, see the comments above explainig IO-APIC
+		 * irq migration in the presence of interrupt-remapping.
+		 */
+		irte.trigger_mode = 0;
 		irte.dlvry_mode = apic->irq_delivery_mode;
 		irte.vector = vector;
 		irte.dest_id = IRTE_DEST(destination);
@@ -1515,18 +1530,23 @@ int setup_ioapic_entry(int apic_id, int irq,
 		ir_entry->zero = 0;
 		ir_entry->format = 1;
 		ir_entry->index = (index & 0x7fff);
+		/*
+		 * IO-APIC RTE will be configured with virtual vector.
+		 * irq handler will do the explicit EOI to the io-apic.
+		 */
+		ir_entry->vector = pin;
 	} else
 #endif
 	{
 		entry->delivery_mode = apic->irq_delivery_mode;
 		entry->dest_mode = apic->irq_dest_mode;
 		entry->dest = destination;
+		entry->vector = vector;
 	}
 
 	entry->mask = 0;				/* enable IRQ */
 	entry->trigger = trigger;
 	entry->polarity = polarity;
-	entry->vector = vector;
 
 	/* Mask level triggered irqs.
 	 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
@@ -1561,7 +1581,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
 
 
 	if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
-			       dest, trigger, polarity, cfg->vector)) {
+			       dest, trigger, polarity, cfg->vector, pin)) {
 		printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
 		       mp_ioapics[apic_id].apicid, pin);
 		__clear_irq_vector(irq, cfg);
@@ -2311,37 +2331,24 @@ static int ioapic_retrigger_irq(unsigned int irq)
 #ifdef CONFIG_SMP
 
 #ifdef CONFIG_INTR_REMAP
-static void ir_irq_migration(struct work_struct *work);
-
-static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
 
 /*
  * Migrate the IO-APIC irq in the presence of intr-remapping.
  *
- * For edge triggered, irq migration is a simple atomic update(of vector
- * and cpu destination) of IRTE and flush the hardware cache.
- *
- * For level triggered, we need to modify the io-apic RTE aswell with the update
- * vector information, along with modifying IRTE with vector and destination.
- * So irq migration for level triggered is little  bit more complex compared to
- * edge triggered migration. But the good news is, we use the same algorithm
- * for level triggered migration as we have today, only difference being,
- * we now initiate the irq migration from process context instead of the
- * interrupt context.
+ * For both level and edge triggered, irq migration is a simple atomic
+ * update(of vector and cpu destination) of IRTE and flush the hardware cache.
  *
- * In future, when we do a directed EOI (combined with cpu EOI broadcast
- * suppression) to the IO-APIC, level triggered irq migration will also be
- * as simple as edge triggered migration and we can do the irq migration
- * with a simple atomic update to IO-APIC RTE.
+ * For level triggered, we eliminate the io-apic RTE modification (with the
+ * updated vector information), by using a virtual vector (io-apic pin number).
+ * Real vector that is used for interrupting cpu will be coming from
+ * the interrupt-remapping table entry.
  */
 static void
 migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 {
 	struct irq_cfg *cfg;
 	struct irte irte;
-	int modify_ioapic_rte;
 	unsigned int dest;
-	unsigned long flags;
 	unsigned int irq;
 
 	if (!cpumask_intersects(mask, cpu_online_mask))
@@ -2359,13 +2366,6 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 
 	dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
 
-	modify_ioapic_rte = desc->status & IRQ_LEVEL;
-	if (modify_ioapic_rte) {
-		spin_lock_irqsave(&ioapic_lock, flags);
-		__target_IO_APIC_irq(irq, dest, cfg);
-		spin_unlock_irqrestore(&ioapic_lock, flags);
-	}
-
 	irte.vector = cfg->vector;
 	irte.dest_id = IRTE_DEST(dest);
 
@@ -2380,73 +2380,12 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 	cpumask_copy(desc->affinity, mask);
 }
 
-static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
-{
-	int ret = -1;
-	struct irq_cfg *cfg = desc->chip_data;
-
-	mask_IO_APIC_irq_desc(desc);
-
-	if (io_apic_level_ack_pending(cfg)) {
-		/*
-		 * Interrupt in progress. Migrating irq now will change the
-		 * vector information in the IO-APIC RTE and that will confuse
-		 * the EOI broadcast performed by cpu.
-		 * So, delay the irq migration to the next instance.
-		 */
-		schedule_delayed_work(&ir_migration_work, 1);
-		goto unmask;
-	}
-
-	/* everthing is clear. we have right of way */
-	migrate_ioapic_irq_desc(desc, desc->pending_mask);
-
-	ret = 0;
-	desc->status &= ~IRQ_MOVE_PENDING;
-	cpumask_clear(desc->pending_mask);
-
-unmask:
-	unmask_IO_APIC_irq_desc(desc);
-
-	return ret;
-}
-
-static void ir_irq_migration(struct work_struct *work)
-{
-	unsigned int irq;
-	struct irq_desc *desc;
-
-	for_each_irq_desc(irq, desc) {
-		if (desc->status & IRQ_MOVE_PENDING) {
-			unsigned long flags;
-
-			spin_lock_irqsave(&desc->lock, flags);
-			if (!desc->chip->set_affinity ||
-			    !(desc->status & IRQ_MOVE_PENDING)) {
-				desc->status &= ~IRQ_MOVE_PENDING;
-				spin_unlock_irqrestore(&desc->lock, flags);
-				continue;
-			}
-
-			desc->chip->set_affinity(irq, desc->pending_mask);
-			spin_unlock_irqrestore(&desc->lock, flags);
-		}
-	}
-}
-
 /*
  * Migrates the IRQ destination in the process context.
  */
 static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
 					    const struct cpumask *mask)
 {
-	if (desc->status & IRQ_LEVEL) {
-		desc->status |= IRQ_MOVE_PENDING;
-		cpumask_copy(desc->pending_mask, mask);
-		migrate_irq_remapped_level_desc(desc);
-		return;
-	}
-
 	migrate_ioapic_irq_desc(desc, mask);
 }
 static void set_ir_ioapic_affinity_irq(unsigned int irq,
@@ -2537,9 +2476,44 @@ static inline void irq_complete_move(struct irq_desc **descp) {}
 #endif
 
 #ifdef CONFIG_INTR_REMAP
+static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+	int apic, pin;
+	struct irq_pin_list *entry;
+
+	entry = cfg->irq_2_pin;
+	for (;;) {
+
+		if (!entry)
+			break;
+
+		apic = entry->apic;
+		pin = entry->pin;
+		io_apic_eoi(apic, pin);
+		entry = entry->next;
+	}
+}
+
+static void
+eoi_ioapic_irq(struct irq_desc *desc)
+{
+	struct irq_cfg *cfg;
+	unsigned long flags;
+	unsigned int irq;
+
+	irq = desc->irq;
+	cfg = desc->chip_data;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__eoi_ioapic_irq(irq, cfg);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
 static void ack_x2apic_level(unsigned int irq)
 {
+	struct irq_desc *desc = irq_to_desc(irq);
 	ack_x2APIC_irq();
+	eoi_ioapic_irq(desc);
 }
 
 static void ack_x2apic_edge(unsigned int irq)
-- 
cgit v1.2.3


From 29b61be65a33c95564fa82e7e8d60d97adb68ea8 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 16 Mar 2009 17:05:02 -0700
Subject: x86, x2apic: cleanup ifdef CONFIG_INTR_REMAP in io_apic code

Impact: cleanup

Clean up #ifdefs and replace them with helper functions.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/apic/io_apic.c  | 44 ++++++++++-------------------------------
 arch/x86/kernel/apic/probe_64.c |  2 --
 2 files changed, 10 insertions(+), 36 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e074eac5bd3..cf27795c641 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -554,16 +554,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
 
 		apic = entry->apic;
 		pin = entry->pin;
-#ifdef CONFIG_INTR_REMAP
 		/*
 		 * With interrupt-remapping, destination information comes
 		 * from interrupt-remapping table entry.
 		 */
 		if (!irq_remapped(irq))
 			io_apic_write(apic, 0x11 + pin*2, dest);
-#else
-		io_apic_write(apic, 0x11 + pin*2, dest);
-#endif
 		reg = io_apic_read(apic, 0x10 + pin*2);
 		reg &= ~IO_APIC_REDIR_VECTOR_MASK;
 		reg |= vector;
@@ -1419,9 +1415,8 @@ void __setup_vector_irq(int cpu)
 }
 
 static struct irq_chip ioapic_chip;
-#ifdef CONFIG_INTR_REMAP
 static struct irq_chip ir_ioapic_chip;
-#endif
+static struct irq_chip msi_ir_chip;
 
 #define IOAPIC_AUTO     -1
 #define IOAPIC_EDGE     0
@@ -1460,7 +1455,6 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
 	else
 		desc->status &= ~IRQ_LEVEL;
 
-#ifdef CONFIG_INTR_REMAP
 	if (irq_remapped(irq)) {
 		desc->status |= IRQ_MOVE_PCNTXT;
 		if (trigger)
@@ -1472,7 +1466,7 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
 						      handle_edge_irq, "edge");
 		return;
 	}
-#endif
+
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 	    trigger == IOAPIC_LEVEL)
 		set_irq_chip_and_handler_name(irq, &ioapic_chip,
@@ -1493,7 +1487,6 @@ int setup_ioapic_entry(int apic_id, int irq,
 	 */
 	memset(entry,0,sizeof(*entry));
 
-#ifdef CONFIG_INTR_REMAP
 	if (intr_remapping_enabled) {
 		struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
 		struct irte irte;
@@ -1535,9 +1528,7 @@ int setup_ioapic_entry(int apic_id, int irq,
 		 * irq handler will do the explicit EOI to the io-apic.
 		 */
 		ir_entry->vector = pin;
-	} else
-#endif
-	{
+	} else {
 		entry->delivery_mode = apic->irq_delivery_mode;
 		entry->dest_mode = apic->irq_dest_mode;
 		entry->dest = destination;
@@ -1662,10 +1653,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
 {
 	struct IO_APIC_route_entry entry;
 
-#ifdef CONFIG_INTR_REMAP
 	if (intr_remapping_enabled)
 		return;
-#endif
 
 	memset(&entry, 0, sizeof(entry));
 
@@ -2395,6 +2384,11 @@ static void set_ir_ioapic_affinity_irq(unsigned int irq,
 
 	set_ir_ioapic_affinity_irq_desc(desc, mask);
 }
+#else
+static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+						   const struct cpumask *mask)
+{
+}
 #endif
 
 asmlinkage void smp_irq_move_cleanup_interrupt(void)
@@ -2883,10 +2877,8 @@ static inline void __init check_timer(void)
 	 * 8259A.
 	 */
 	if (pin1 == -1) {
-#ifdef CONFIG_INTR_REMAP
 		if (intr_remapping_enabled)
 			panic("BIOS bug: timer not connected to IO-APIC");
-#endif
 		pin1 = pin2;
 		apic1 = apic2;
 		no_pin1 = 1;
@@ -2922,10 +2914,8 @@ static inline void __init check_timer(void)
 				clear_IO_APIC_pin(0, pin1);
 			goto out;
 		}
-#ifdef CONFIG_INTR_REMAP
 		if (intr_remapping_enabled)
 			panic("timer doesn't work through Interrupt-remapped IO-APIC");
-#endif
 		local_irq_disable();
 		clear_IO_APIC_pin(apic1, pin1);
 		if (!no_pin1)
@@ -3219,9 +3209,7 @@ void destroy_irq(unsigned int irq)
 	if (desc)
 		desc->chip_data = cfg;
 
-#ifdef CONFIG_INTR_REMAP
 	free_irte(irq);
-#endif
 	spin_lock_irqsave(&vector_lock, flags);
 	__clear_irq_vector(irq, cfg);
 	spin_unlock_irqrestore(&vector_lock, flags);
@@ -3247,7 +3235,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 
 	dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
 
-#ifdef CONFIG_INTR_REMAP
 	if (irq_remapped(irq)) {
 		struct irte irte;
 		int ir_index;
@@ -3273,9 +3260,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 				  MSI_ADDR_IR_SHV |
 				  MSI_ADDR_IR_INDEX1(ir_index) |
 				  MSI_ADDR_IR_INDEX2(ir_index);
-	} else
-#endif
-	{
+	} else {
 		if (x2apic_enabled())
 			msg->address_hi = MSI_ADDR_BASE_HI |
 					  MSI_ADDR_EXT_DEST_ID(dest);
@@ -3392,6 +3377,7 @@ static struct irq_chip msi_ir_chip = {
 #endif
 	.retrigger	= ioapic_retrigger_irq,
 };
+#endif
 
 /*
  * Map the PCI dev to the corresponding remapping hardware unit
@@ -3419,7 +3405,6 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
 	}
 	return index;
 }
-#endif
 
 static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 {
@@ -3433,7 +3418,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 	set_irq_msi(irq, msidesc);
 	write_msi_msg(irq, &msg);
 
-#ifdef CONFIG_INTR_REMAP
 	if (irq_remapped(irq)) {
 		struct irq_desc *desc = irq_to_desc(irq);
 		/*
@@ -3442,7 +3426,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 		desc->status |= IRQ_MOVE_PCNTXT;
 		set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
 	} else
-#endif
 		set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
 
 	dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
@@ -3456,11 +3439,8 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	int ret, sub_handle;
 	struct msi_desc *msidesc;
 	unsigned int irq_want;
-
-#ifdef CONFIG_INTR_REMAP
 	struct intel_iommu *iommu = 0;
 	int index = 0;
-#endif
 
 	irq_want = nr_irqs_gsi;
 	sub_handle = 0;
@@ -3469,7 +3449,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 		if (irq == 0)
 			return -1;
 		irq_want = irq + 1;
-#ifdef CONFIG_INTR_REMAP
 		if (!intr_remapping_enabled)
 			goto no_ir;
 
@@ -3497,7 +3476,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 			set_irte_irq(irq, iommu, index, sub_handle);
 		}
 no_ir:
-#endif
 		ret = setup_msi_irq(dev, msidesc, irq);
 		if (ret < 0)
 			goto error;
@@ -4032,11 +4010,9 @@ void __init setup_ioapic_dest(void)
 			else
 				mask = apic->target_cpus();
 
-#ifdef CONFIG_INTR_REMAP
 			if (intr_remapping_enabled)
 				set_ir_ioapic_affinity_irq_desc(desc, mask);
 			else
-#endif
 				set_ioapic_affinity_irq_desc(desc, mask);
 		}
 
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 8297c2b8ed2..1783652bb0e 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -69,14 +69,12 @@ void __init default_setup_apic_routing(void)
 		printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
 	}
 
-#ifdef CONFIG_X86_X2APIC
 	/*
 	 * Now that apic routing model is selected, configure the
 	 * fault handling for intr remapping.
 	 */
 	if (intr_remapping_enabled)
 		enable_drhd_fault_handling();
-#endif
 }
 
 /* Same for both flat and physical. */
-- 
cgit v1.2.3


From 05c3dc2c4b60387769cbe73174347de4cf85f0c9 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 16 Mar 2009 17:05:03 -0700
Subject: x86, ioapic: Fix non atomic allocation with interrupts disabled

Impact: fix possible race

save_mask_IO_APIC_setup() was using non atomic memory allocation while getting
called with interrupts disabled. Fix this by splitting this into two different
function. Allocation part save_IO_APIC_setup() now happens before
disabling interrupts.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/apic/apic.c    | 11 ++++++-----
 arch/x86/kernel/apic/io_apic.c | 34 +++++++++++++++++++++++-----------
 2 files changed, 29 insertions(+), 16 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 699f8cf76bb..85eb8e10081 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1334,15 +1334,16 @@ void __init enable_IR_x2apic(void)
 		return;
 	}
 
-	local_irq_save(flags);
-	mask_8259A();
-
-	ret = save_mask_IO_APIC_setup();
+	ret = save_IO_APIC_setup();
 	if (ret) {
 		pr_info("Saving IO-APIC state failed: %d\n", ret);
 		goto end;
 	}
 
+	local_irq_save(flags);
+	mask_IO_APIC_setup();
+	mask_8259A();
+
 	ret = enable_intr_remapping(1);
 
 	if (ret && x2apic_preenabled) {
@@ -1367,10 +1368,10 @@ end_restore:
 	else
 		reinit_intr_remapped_IO_APIC(x2apic_preenabled);
 
-end:
 	unmask_8259A();
 	local_irq_restore(flags);
 
+end:
 	if (!ret) {
 		if (!x2apic_preenabled)
 			pr_info("Enabled x2apic and interrupt-remapping\n");
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index cf27795c641..ff1759a1128 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -853,9 +853,9 @@ __setup("pirq=", ioapic_pirq_setup);
 static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
 
 /*
- * Saves and masks all the unmasked IO-APIC RTE's
+ * Saves all the IO-APIC RTE's
  */
-int save_mask_IO_APIC_setup(void)
+int save_IO_APIC_setup(void)
 {
 	union IO_APIC_reg_01 reg_01;
 	unsigned long flags;
@@ -880,16 +880,9 @@ int save_mask_IO_APIC_setup(void)
 	}
 
 	for (apic = 0; apic < nr_ioapics; apic++)
-		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
-			struct IO_APIC_route_entry entry;
-
-			entry = early_ioapic_entries[apic][pin] =
+		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+			early_ioapic_entries[apic][pin] =
 				ioapic_read_entry(apic, pin);
-			if (!entry.mask) {
-				entry.mask = 1;
-				ioapic_write_entry(apic, pin, entry);
-			}
-		}
 
 	return 0;
 
@@ -902,6 +895,25 @@ nomem:
 	return -ENOMEM;
 }
 
+void mask_IO_APIC_setup(void)
+{
+	int apic, pin;
+
+	for (apic = 0; apic < nr_ioapics; apic++) {
+		if (!early_ioapic_entries[apic])
+			break;
+		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+			struct IO_APIC_route_entry entry;
+
+			entry = early_ioapic_entries[apic][pin];
+			if (!entry.mask) {
+				entry.mask = 1;
+				ioapic_write_entry(apic, pin, entry);
+			}
+		}
+	}
+}
+
 void restore_IO_APIC_setup(void)
 {
 	int apic, pin;
-- 
cgit v1.2.3


From 68a8ca593fac82e336a792226272455901fa83df Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 16 Mar 2009 17:05:04 -0700
Subject: x86: fix broken irq migration logic while cleaning up multiple
 vectors

Impact: fix spurious IRQs

During irq migration, we send a low priority interrupt to the previous
irq destination. This happens in non interrupt-remapping case after interrupt
starts arriving at new destination and in interrupt-remapping case after
modifying and flushing the interrupt-remapping table entry caches.

This low priority irq cleanup handler can cleanup multiple vectors, as
multiple irq's can be migrated at almost the same time. While
there will be multiple invocations of irq cleanup handler (one cleanup
IPI for each irq migration), first invocation of the cleanup handler
can potentially cleanup more than one vector (as the first invocation can
see the requests for more than vector cleanup). When we cleanup multiple
vectors during the first invocation of the smp_irq_move_cleanup_interrupt(),
other vectors that are to be cleanedup can still be pending in the local
cpu's IRR (as smp_irq_move_cleanup_interrupt() runs with interrupts disabled).

When we are ready to unhook a vector corresponding to an irq, check if that
vector is registered in the local cpu's IRR. If so skip that cleanup and
do a self IPI with the cleanup vector, so that we give a chance to
service the pending vector interrupt and then cleanup that vector
allocation once we execute the lowest priority handler.

This fixes spurious interrupts seen when migrating multiple vectors
at the same time.

[ This is apparently possible even on conventional xapic, although to
  the best of our knowledge it has never been seen.  The stable
  maintainers may wish to consider this one for -stable. ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: stable@kernel.org
---
 arch/x86/kernel/apic/io_apic.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ff1759a1128..42cdc78427a 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2414,6 +2414,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
 	me = smp_processor_id();
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
 		unsigned int irq;
+		unsigned int irr;
 		struct irq_desc *desc;
 		struct irq_cfg *cfg;
 		irq = __get_cpu_var(vector_irq)[vector];
@@ -2433,6 +2434,18 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
 		if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
 			goto unlock;
 
+		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+		/*
+		 * Check if the vector that needs to be cleanedup is
+		 * registered at the cpu's IRR. If so, then this is not
+		 * the best time to clean it up. Lets clean it up in the
+		 * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
+		 * to myself.
+		 */
+		if (irr  & (1 << (vector % 32))) {
+			apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
+			goto unlock;
+		}
 		__get_cpu_var(vector_irq)[vector] = -1;
 		cfg->move_cleanup_count--;
 unlock:
-- 
cgit v1.2.3


From a6b6a14e0c60561f2902b078bd28d0e61defad70 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 18 Mar 2009 10:40:25 +1030
Subject: x86: use smp_call_function_single() in
 arch/x86/kernel/cpu/mcheck/mce_amd_64.c

Attempting to rid us of the problematic work_on_cpu().  Just use
smp_call_function_single() here.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <20090318042217.EF3F1DDF39@ozlabs.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 40 ++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 16 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index c5a32f92d07..7d01be86887 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -92,7 +92,8 @@ struct thresh_restart {
 };
 
 /* must be called with correct cpu affinity */
-static long threshold_restart_bank(void *_tr)
+/* Called via smp_call_function_single() */
+static void threshold_restart_bank(void *_tr)
 {
 	struct thresh_restart *tr = _tr;
 	u32 mci_misc_hi, mci_misc_lo;
@@ -119,7 +120,6 @@ static long threshold_restart_bank(void *_tr)
 
 	mci_misc_hi |= MASK_COUNT_EN_HI;
 	wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
-	return 0;
 }
 
 /* cpu init entry point, called from mce.c with preempt off */
@@ -279,7 +279,7 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
 	tr.b = b;
 	tr.reset = 0;
 	tr.old_limit = 0;
-	work_on_cpu(b->cpu, threshold_restart_bank, &tr);
+	smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
 
 	return end - buf;
 }
@@ -301,23 +301,32 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
 	tr.b = b;
 	tr.reset = 0;
 
-	work_on_cpu(b->cpu, threshold_restart_bank, &tr);
+	smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
 
 	return end - buf;
 }
 
-static long local_error_count(void *_b)
+struct threshold_block_cross_cpu {
+	struct threshold_block *tb;
+	long retval;
+};
+
+static void local_error_count_handler(void *_tbcc)
 {
-	struct threshold_block *b = _b;
+	struct threshold_block_cross_cpu *tbcc = _tbcc;
+	struct threshold_block *b = tbcc->tb;
 	u32 low, high;
 
 	rdmsr(b->address, low, high);
-	return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
+	tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
 }
 
 static ssize_t show_error_count(struct threshold_block *b, char *buf)
 {
-	return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b));
+	struct threshold_block_cross_cpu tbcc = { .tb = b, };
+
+	smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1);
+	return sprintf(buf, "%lx\n", tbcc.retval);
 }
 
 static ssize_t store_error_count(struct threshold_block *b,
@@ -325,7 +334,7 @@ static ssize_t store_error_count(struct threshold_block *b,
 {
 	struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
 
-	work_on_cpu(b->cpu, threshold_restart_bank, &tr);
+	smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
 	return 1;
 }
 
@@ -394,7 +403,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
 	if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
 		return 0;
 
-	if (rdmsr_safe(address, &low, &high))
+	if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
 		return 0;
 
 	if (!(high & MASK_VALID_HI)) {
@@ -458,12 +467,11 @@ out_free:
 	return err;
 }
 
-static __cpuinit long local_allocate_threshold_blocks(void *_bank)
+static __cpuinit long
+local_allocate_threshold_blocks(int cpu, unsigned int bank)
 {
-	unsigned int *bank = _bank;
-
-	return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
-					 MSR_IA32_MC0_MISC + *bank * 4);
+	return allocate_threshold_blocks(cpu, bank, 0,
+					 MSR_IA32_MC0_MISC + bank * 4);
 }
 
 /* symlinks sibling shared banks to first core.  first core owns dir/files. */
@@ -526,7 +534,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
 	per_cpu(threshold_banks, cpu)[bank] = b;
 
-	err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank);
+	err = local_allocate_threshold_blocks(cpu, bank);
 	if (err)
 		goto out_free;
 
-- 
cgit v1.2.3


From ce4e240c279a31096f74afa6584a62d64a1ba8c8 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Tue, 17 Mar 2009 10:16:54 -0800
Subject: x86: add x2apic_wrmsr_fence() to x2apic flush tlb paths

Impact: optimize APIC IPI related barriers

Uncached MMIO accesses for xapic are inherently serializing and hence
we don't need explicit barriers for xapic IPI paths.

x2apic MSR writes/reads don't have serializing semantics and hence need
a serializing instruction or mfence, to make all the previous memory
stores globally visisble before the x2apic msr write for IPI.

Add x2apic_wrmsr_fence() in flush tlb path to x2apic specific paths.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: "steiner@sgi.com" <steiner@sgi.com>
Cc: Nick Piggin <npiggin@suse.de>
LKML-Reference: <1237313814.27006.203.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_cluster.c | 6 ++++++
 arch/x86/kernel/apic/x2apic_phys.c    | 6 ++++++
 2 files changed, 12 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 8fb87b6dd63..4a903e2f0d1 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -57,6 +57,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
 	unsigned long query_cpu;
 	unsigned long flags;
 
+	x2apic_wrmsr_fence();
+
 	local_irq_save(flags);
 	for_each_cpu(query_cpu, mask) {
 		__x2apic_send_IPI_dest(
@@ -73,6 +75,8 @@ static void
 	unsigned long query_cpu;
 	unsigned long flags;
 
+	x2apic_wrmsr_fence();
+
 	local_irq_save(flags);
 	for_each_cpu(query_cpu, mask) {
 		if (query_cpu == this_cpu)
@@ -90,6 +94,8 @@ static void x2apic_send_IPI_allbutself(int vector)
 	unsigned long query_cpu;
 	unsigned long flags;
 
+	x2apic_wrmsr_fence();
+
 	local_irq_save(flags);
 	for_each_online_cpu(query_cpu) {
 		if (query_cpu == this_cpu)
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 23625b9f98b..a284359627e 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -58,6 +58,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
 	unsigned long query_cpu;
 	unsigned long flags;
 
+	x2apic_wrmsr_fence();
+
 	local_irq_save(flags);
 	for_each_cpu(query_cpu, mask) {
 		__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
@@ -73,6 +75,8 @@ static void
 	unsigned long query_cpu;
 	unsigned long flags;
 
+	x2apic_wrmsr_fence();
+
 	local_irq_save(flags);
 	for_each_cpu(query_cpu, mask) {
 		if (query_cpu != this_cpu)
@@ -89,6 +93,8 @@ static void x2apic_send_IPI_allbutself(int vector)
 	unsigned long query_cpu;
 	unsigned long flags;
 
+	x2apic_wrmsr_fence();
+
 	local_irq_save(flags);
 	for_each_online_cpu(query_cpu) {
 		if (query_cpu == this_cpu)
-- 
cgit v1.2.3


From 2c74d66624ddbda8101d54d1e184cf9229b378bc Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 18 Mar 2009 08:22:30 +1030
Subject: x86, uv: fix cpumask iterator in uv_bau_init()

Impact: fix boot crash on UV systems

Commit 76ba0ecda0de9accea9a91cb6dbde46782110e1c "cpumask: use
cpumask_var_t in uv_flush_tlb_others" used cur_cpu as an iterator;
it was supposed to be zero for the code below it.

Reported-by: Cliff Wickman <cpw@sgi.com>
Original-From: Cliff Wickman <cpw@sgi.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Mike Travis <travis@sgi.com>
Cc: steiner@sgi.com
Cc: <stable@kernel.org>
LKML-Reference: <200903180822.31196.rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index d038b9c45cf..79c07324728 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -750,7 +750,7 @@ static int __init uv_bau_init(void)
 	int node;
 	int nblades;
 	int last_blade;
-	int cur_cpu = 0;
+	int cur_cpu;
 
 	if (!is_uv_system())
 		return 0;
@@ -760,6 +760,7 @@ static int __init uv_bau_init(void)
 	uv_mmask = (1UL << uv_hub_info->n_val) - 1;
 	nblades = 0;
 	last_blade = -1;
+	cur_cpu = 0;
 	for_each_online_node(node) {
 		blade = uv_node_to_blade_id(node);
 		if (blade == last_blade)
-- 
cgit v1.2.3


From 4e16c888754468a58d4829c2eba2c6aa3a065e2b Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Wed, 18 Mar 2009 17:36:55 +0530
Subject: x86: cpu/mttr/cleanup.c fix compilation warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

arch/x86/kernel/cpu/mtrr/cleanup.c:197: warning: format ‘%d’ expects type ‘int’, but argument 2 has type ‘long unsigned int’

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1237378015.13488.1.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mtrr/cleanup.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index f4f89fbc228..ce0fe4b5c04 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -160,8 +160,9 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
 		       unsigned long extra_remove_base,
 		       unsigned long extra_remove_size)
 {
-	unsigned long i, base, size;
+	unsigned long base, size;
 	mtrr_type type;
+	int i;
 
 	for (i = 0; i < num_var_ranges; i++) {
 		type = range_state[i].type;
-- 
cgit v1.2.3


From cde5edbda8ba7d600154ce4171125a48e4d2a21b Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Wed, 18 Mar 2009 17:37:45 +0530
Subject: x86: kprobes.c fix compilation warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

arch/x86/kernel/kprobes.c:196: warning: passing argument 1 of ‘search_exception_tables’ makes integer from pointer without a cast

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference:<49BED952.2050809@redhat.com>
LKML-Reference: <1237378065.13488.2.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/kprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 4558dd3918c..55b94614e34 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -193,7 +193,7 @@ static int __kprobes can_boost(kprobe_opcode_t *opcodes)
 	kprobe_opcode_t opcode;
 	kprobe_opcode_t *orig_opcodes = opcodes;
 
-	if (search_exception_tables(opcodes))
+	if (search_exception_tables((unsigned long)opcodes))
 		return 0;	/* Page fault may occur on this address. */
 
 retry:
-- 
cgit v1.2.3


From a6830278568a8bb9758aac152db15187741e0113 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Wed, 18 Mar 2009 20:42:28 +0530
Subject: x86: mpparse: clean up code by introducing a few helper functions

Impact: cleanup

Refactor the MP-table parsing code via the introduction of the
following helper functions:

  skip_entry()
  smp_reserve_bootmem()
  check_irq_src()
  check_slot()

To simplify the code flow and to reduce the size of the
following oversized functions: smp_read_mpc(), smp_scan_config().

There should be no impact to functionality.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/mpparse.c | 261 +++++++++++++++++++++-------------------------
 1 file changed, 120 insertions(+), 141 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 47673e02ae5..58ddf6259af 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -109,9 +109,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
 	} else
 		printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
 }
-#endif
-
-#ifdef CONFIG_X86_IO_APIC
 
 static int bad_ioapic(unsigned long address)
 {
@@ -224,8 +221,12 @@ static void __init MP_intsrc_info(struct mpc_intsrc *m)
 	if (++mp_irq_entries == MAX_IRQ_SOURCES)
 		panic("Max # of irq sources exceeded!!\n");
 }
+#else /* CONFIG_X86_IO_APIC */
+static inline void __init MP_bus_info(struct mpc_bus *m) {}
+static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
+static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {}
+#endif /* CONFIG_X86_IO_APIC */
 
-#endif
 
 static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
 {
@@ -275,6 +276,12 @@ static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
 	return 1;
 }
 
+static void skip_entry(unsigned char **ptr, int *count, int size)
+{
+	*ptr += size;
+	*count += size;
+}
+
 static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
 {
 	char str[16];
@@ -310,55 +317,27 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
 	while (count < mpc->length) {
 		switch (*mpt) {
 		case MP_PROCESSOR:
-			{
-				struct mpc_cpu *m = (struct mpc_cpu *)mpt;
-				/* ACPI may have already provided this data */
-				if (!acpi_lapic)
-					MP_processor_info(m);
-				mpt += sizeof(*m);
-				count += sizeof(*m);
-				break;
-			}
+			/* ACPI may have already provided this data */
+			if (!acpi_lapic)
+				MP_processor_info((struct mpc_cpu *)&mpt);
+			skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+			break;
 		case MP_BUS:
-			{
-				struct mpc_bus *m = (struct mpc_bus *)mpt;
-#ifdef CONFIG_X86_IO_APIC
-				MP_bus_info(m);
-#endif
-				mpt += sizeof(*m);
-				count += sizeof(*m);
-				break;
-			}
+			MP_bus_info((struct mpc_bus *)&mpt);
+			skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+			break;
 		case MP_IOAPIC:
-			{
-#ifdef CONFIG_X86_IO_APIC
-				struct mpc_ioapic *m = (struct mpc_ioapic *)mpt;
-				MP_ioapic_info(m);
-#endif
-				mpt += sizeof(struct mpc_ioapic);
-				count += sizeof(struct mpc_ioapic);
-				break;
-			}
+			MP_ioapic_info((struct mpc_ioapic *)&mpt);
+			skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+			break;
 		case MP_INTSRC:
-			{
-#ifdef CONFIG_X86_IO_APIC
-				struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
-
-				MP_intsrc_info(m);
-#endif
-				mpt += sizeof(struct mpc_intsrc);
-				count += sizeof(struct mpc_intsrc);
-				break;
-			}
+			MP_intsrc_info((struct mpc_intsrc *)&mpt);
+			skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+			break;
 		case MP_LINTSRC:
-			{
-				struct mpc_lintsrc *m =
-				    (struct mpc_lintsrc *)mpt;
-				MP_lintsrc_info(m);
-				mpt += sizeof(*m);
-				count += sizeof(*m);
-				break;
-			}
+			MP_lintsrc_info((struct mpc_lintsrc *)&mpt);
+			skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+			break;
 		default:
 			/* wrong mptable */
 			printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
@@ -689,6 +668,31 @@ void __init get_smp_config(void)
 	__get_smp_config(0);
 }
 
+static void smp_reserve_bootmem(struct mpf_intel *mpf)
+{
+	unsigned long size = get_mpc_size(mpf->physptr);
+#ifdef CONFIG_X86_32
+	/*
+	 * We cannot access to MPC table to compute table size yet,
+	 * as only few megabytes from the bottom is mapped now.
+	 * PC-9800's MPC table places on the very last of physical
+	 * memory; so that simply reserving PAGE_SIZE from mpf->physptr
+	 * yields BUG() in reserve_bootmem.
+	 * also need to make sure physptr is below than max_low_pfn
+	 * we don't need reserve the area above max_low_pfn
+	 */
+	unsigned long end = max_low_pfn * PAGE_SIZE;
+
+	if (mpf->physptr < end) {
+		if (mpf->physptr + size > end)
+			size = end - mpf->physptr;
+		reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
+	}
+#else
+	reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
+#endif
+}
+
 static int __init smp_scan_config(unsigned long base, unsigned long length,
 				  unsigned reserve)
 {
@@ -717,35 +721,9 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 			if (!reserve)
 				return 1;
 			reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
-					BOOTMEM_DEFAULT);
-			if (mpf->physptr) {
-				unsigned long size = get_mpc_size(mpf->physptr);
-#ifdef CONFIG_X86_32
-				/*
-				 * We cannot access to MPC table to compute
-				 * table size yet, as only few megabytes from
-				 * the bottom is mapped now.
-				 * PC-9800's MPC table places on the very last
-				 * of physical memory; so that simply reserving
-				 * PAGE_SIZE from mpf->physptr yields BUG()
-				 * in reserve_bootmem.
-				 * also need to make sure physptr is below than
-				 * max_low_pfn
-				 * we don't need reserve the area above max_low_pfn
-				 */
-				unsigned long end = max_low_pfn * PAGE_SIZE;
-
-				if (mpf->physptr < end) {
-					if (mpf->physptr + size > end)
-						size = end - mpf->physptr;
-					reserve_bootmem_generic(mpf->physptr, size,
-							BOOTMEM_DEFAULT);
-				}
-#else
-				reserve_bootmem_generic(mpf->physptr, size,
 						BOOTMEM_DEFAULT);
-#endif
-			}
+			if (mpf->physptr)
+				smp_reserve_bootmem(mpf);
 
 			return 1;
 		}
@@ -848,7 +826,57 @@ static int  __init get_MP_intsrc_index(struct mpc_intsrc *m)
 #define SPARE_SLOT_NUM 20
 
 static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
-#endif
+
+static void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
+{
+	int i;
+
+	apic_printk(APIC_VERBOSE, "OLD ");
+	print_MP_intsrc_info(m);
+
+	i = get_MP_intsrc_index(m);
+	if (i > 0) {
+		assign_to_mpc_intsrc(&mp_irqs[i], m);
+		apic_printk(APIC_VERBOSE, "NEW ");
+		print_mp_irq_info(&mp_irqs[i]);
+		return;
+	}
+	if (!i) {
+		/* legacy, do nothing */
+		return;
+	}
+	if (*nr_m_spare < SPARE_SLOT_NUM) {
+		/*
+		 * not found (-1), or duplicated (-2) are invalid entries,
+		 * we need to use the slot later
+		 */
+		m_spare[*nr_m_spare] = m;
+		*nr_m_spare += 1;
+	}
+}
+#else /* CONFIG_X86_IO_APIC */
+static inline void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
+#endif /* CONFIG_X86_IO_APIC */
+
+static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length,
+		      int count)
+{
+	if (!mpc_new_phys) {
+		pr_info("No spare slots, try to append...take your risk, "
+			"new mpc_length %x\n", count);
+	} else {
+		if (count <= mpc_new_length)
+			pr_info("No spare slots, try to append..., "
+				"new mpc_length %x\n", count);
+		else {
+			pr_err("mpc_new_length %lx is too small\n",
+				mpc_new_length);
+			return -1;
+		}
+	}
+
+	return 0;
+}
 
 static int  __init replace_intsrc_all(struct mpc_table *mpc,
 					unsigned long mpc_new_phys,
@@ -856,71 +884,30 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,
 {
 #ifdef CONFIG_X86_IO_APIC
 	int i;
-	int nr_m_spare = 0;
 #endif
-
 	int count = sizeof(*mpc);
+	int nr_m_spare = 0;
 	unsigned char *mpt = ((unsigned char *)mpc) + count;
 
 	printk(KERN_INFO "mpc_length %x\n", mpc->length);
 	while (count < mpc->length) {
 		switch (*mpt) {
 		case MP_PROCESSOR:
-			{
-				struct mpc_cpu *m = (struct mpc_cpu *)mpt;
-				mpt += sizeof(*m);
-				count += sizeof(*m);
-				break;
-			}
+			skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+			break;
 		case MP_BUS:
-			{
-				struct mpc_bus *m = (struct mpc_bus *)mpt;
-				mpt += sizeof(*m);
-				count += sizeof(*m);
-				break;
-			}
+			skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+			break;
 		case MP_IOAPIC:
-			{
-				mpt += sizeof(struct mpc_ioapic);
-				count += sizeof(struct mpc_ioapic);
-				break;
-			}
+			skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+			break;
 		case MP_INTSRC:
-			{
-#ifdef CONFIG_X86_IO_APIC
-				struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
-
-				apic_printk(APIC_VERBOSE, "OLD ");
-				print_MP_intsrc_info(m);
-				i = get_MP_intsrc_index(m);
-				if (i > 0) {
-					assign_to_mpc_intsrc(&mp_irqs[i], m);
-					apic_printk(APIC_VERBOSE, "NEW ");
-					print_mp_irq_info(&mp_irqs[i]);
-				} else if (!i) {
-					/* legacy, do nothing */
-				} else if (nr_m_spare < SPARE_SLOT_NUM) {
-					/*
-					 * not found (-1), or duplicated (-2)
-					 * are invalid entries,
-					 * we need to use the slot  later
-					 */
-					m_spare[nr_m_spare] = m;
-					nr_m_spare++;
-				}
-#endif
-				mpt += sizeof(struct mpc_intsrc);
-				count += sizeof(struct mpc_intsrc);
-				break;
-			}
+			check_irq_src((struct mpc_intsrc *)&mpt, &nr_m_spare);
+			skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+			break;
 		case MP_LINTSRC:
-			{
-				struct mpc_lintsrc *m =
-				    (struct mpc_lintsrc *)mpt;
-				mpt += sizeof(*m);
-				count += sizeof(*m);
-				break;
-			}
+			skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+			break;
 		default:
 			/* wrong mptable */
 			printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
@@ -950,16 +937,8 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,
 		} else {
 			struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
 			count += sizeof(struct mpc_intsrc);
-			if (!mpc_new_phys) {
-				printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
-			} else {
-				if (count <= mpc_new_length)
-					printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
-				else {
-					printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
-					goto out;
-				}
-			}
+			if (!check_slot(mpc_new_phys, mpc_new_length, count))
+				goto out;
 			assign_to_mpc_intsrc(&mp_irqs[i], m);
 			mpc->length = count;
 			mpt += sizeof(struct mpc_intsrc);
-- 
cgit v1.2.3


From c58603e81b3ed4f1c7352e091fe43fd0bd8d06cc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 19 Mar 2009 08:50:35 +0100
Subject: x86: mpparse: clean up code by introducing a few helper functions,
 fix

Impact: fix boot crash

This fixes commit a6830278568a8bb9758aac152db15187741e0113.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1237403503.22438.21.camel@ht.satnam>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/mpparse.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 58ddf6259af..290cb57f469 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -319,23 +319,23 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
 		case MP_PROCESSOR:
 			/* ACPI may have already provided this data */
 			if (!acpi_lapic)
-				MP_processor_info((struct mpc_cpu *)&mpt);
+				MP_processor_info((struct mpc_cpu *)mpt);
 			skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
 			break;
 		case MP_BUS:
-			MP_bus_info((struct mpc_bus *)&mpt);
+			MP_bus_info((struct mpc_bus *)mpt);
 			skip_entry(&mpt, &count, sizeof(struct mpc_bus));
 			break;
 		case MP_IOAPIC:
-			MP_ioapic_info((struct mpc_ioapic *)&mpt);
+			MP_ioapic_info((struct mpc_ioapic *)mpt);
 			skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
 			break;
 		case MP_INTSRC:
-			MP_intsrc_info((struct mpc_intsrc *)&mpt);
+			MP_intsrc_info((struct mpc_intsrc *)mpt);
 			skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
 			break;
 		case MP_LINTSRC:
-			MP_lintsrc_info((struct mpc_lintsrc *)&mpt);
+			MP_lintsrc_info((struct mpc_lintsrc *)mpt);
 			skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
 			break;
 		default:
@@ -902,7 +902,7 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,
 			skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
 			break;
 		case MP_INTSRC:
-			check_irq_src((struct mpc_intsrc *)&mpt, &nr_m_spare);
+			check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare);
 			skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
 			break;
 		case MP_LINTSRC:
-- 
cgit v1.2.3


From 71ff49d71bb5cfcd2689b54cb433c0e6990a1d86 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 18 Mar 2009 13:03:33 -0700
Subject: x86: with the last user gone, remove set_pte_present

Impact: cleanup

set_pte_present() is no longer used, directly or indirectly,
so remove it.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Xen-devel <xen-devel@lists.xensource.com>
Cc: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
LKML-Reference: <1237406613-2929-2-git-send-email-jeremy@goop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/kvm.c      | 7 -------
 arch/x86/kernel/paravirt.c | 1 -
 arch/x86/kernel/vmi_32.c   | 6 ------
 3 files changed, 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 478bca986ec..33019ddb56b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -138,12 +138,6 @@ static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
 	kvm_mmu_write(ptep, pte_val(pte));
 }
 
-static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr,
-				pte_t *ptep, pte_t pte)
-{
-	kvm_mmu_write(ptep, pte_val(pte));
-}
-
 static void kvm_pte_clear(struct mm_struct *mm,
 			  unsigned long addr, pte_t *ptep)
 {
@@ -220,7 +214,6 @@ static void paravirt_ops_setup(void)
 #if PAGETABLE_LEVELS >= 3
 #ifdef CONFIG_X86_PAE
 		pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
-		pv_mmu_ops.set_pte_present = kvm_set_pte_present;
 		pv_mmu_ops.pte_clear = kvm_pte_clear;
 		pv_mmu_ops.pmd_clear = kvm_pmd_clear;
 #endif
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 63dd358d8ee..8e45f446488 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -470,7 +470,6 @@ struct pv_mmu_ops pv_mmu_ops = {
 #if PAGETABLE_LEVELS >= 3
 #ifdef CONFIG_X86_PAE
 	.set_pte_atomic = native_set_pte_atomic,
-	.set_pte_present = native_set_pte_present,
 	.pte_clear = native_pte_clear,
 	.pmd_clear = native_pmd_clear,
 #endif
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 2cc4a90e2cb..95deb9f2211 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -395,11 +395,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
 	vmi_ops.update_pte(ptep, VMI_PAGE_PT);
 }
 
-static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
-{
-	vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
-}
-
 static void vmi_set_pud(pud_t *pudp, pud_t pudval)
 {
 	/* Um, eww */
@@ -750,7 +745,6 @@ static inline int __init activate_vmi(void)
 		pv_mmu_ops.set_pmd = vmi_set_pmd;
 #ifdef CONFIG_X86_PAE
 		pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
-		pv_mmu_ops.set_pte_present = vmi_set_pte_present;
 		pv_mmu_ops.set_pud = vmi_set_pud;
 		pv_mmu_ops.pte_clear = vmi_pte_clear;
 		pv_mmu_ops.pmd_clear = vmi_pmd_clear;
-- 
cgit v1.2.3


From 14fc9fbc700dc95b4f46ebd588169324fe6deff8 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Thu, 19 Mar 2009 10:56:29 -0700
Subject: x86: signal: check signal stack overflow properly

Impact: cleanup

Check alternate signal stack overflow with proper stack pointer.
The stack pointer of the next signal frame is different if that
task has i387 state.

On x86_64, redzone would be included.

No need to check SA_ONSTACK if we're already using alternate signal stack.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Cc: Roland McGrath <roland@redhat.com>
LKML-Reference: <49C2874D.3080002@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/signal.c | 48 +++++++++++++++++++++++++++---------------------
 1 file changed, 27 insertions(+), 21 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index d2cc6428c58..dfcc74ab0ab 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -211,31 +211,27 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 {
 	/* Default to using normal stack */
 	unsigned long sp = regs->sp;
+	int onsigstack = on_sig_stack(sp);
 
 #ifdef CONFIG_X86_64
 	/* redzone */
 	sp -= 128;
 #endif /* CONFIG_X86_64 */
 
-	/*
-	 * If we are on the alternate signal stack and would overflow it, don't.
-	 * Return an always-bogus address instead so we will die with SIGSEGV.
-	 */
-	if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size)))
-		return (void __user *) -1L;
-
-	/* This is the X/Open sanctioned signal stack switching.  */
-	if (ka->sa.sa_flags & SA_ONSTACK) {
-		if (sas_ss_flags(sp) == 0)
-			sp = current->sas_ss_sp + current->sas_ss_size;
-	} else {
+	if (!onsigstack) {
+		/* This is the X/Open sanctioned signal stack switching.  */
+		if (ka->sa.sa_flags & SA_ONSTACK) {
+			if (sas_ss_flags(sp) == 0)
+				sp = current->sas_ss_sp + current->sas_ss_size;
+		} else {
 #ifdef CONFIG_X86_32
-		/* This is the legacy signal stack switching. */
-		if ((regs->ss & 0xffff) != __USER_DS &&
-			!(ka->sa.sa_flags & SA_RESTORER) &&
-				ka->sa.sa_restorer)
-			sp = (unsigned long) ka->sa.sa_restorer;
+			/* This is the legacy signal stack switching. */
+			if ((regs->ss & 0xffff) != __USER_DS &&
+				!(ka->sa.sa_flags & SA_RESTORER) &&
+					ka->sa.sa_restorer)
+				sp = (unsigned long) ka->sa.sa_restorer;
 #endif /* CONFIG_X86_32 */
+		}
 	}
 
 	if (used_math()) {
@@ -244,12 +240,22 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 		sp = round_down(sp, 64);
 #endif /* CONFIG_X86_64 */
 		*fpstate = (void __user *)sp;
-
-		if (save_i387_xstate(*fpstate) < 0)
-			return (void __user *)-1L;
 	}
 
-	return (void __user *)align_sigframe(sp - frame_size);
+	sp = align_sigframe(sp - frame_size);
+
+	/*
+	 * If we are on the alternate signal stack and would overflow it, don't.
+	 * Return an always-bogus address instead so we will die with SIGSEGV.
+	 */
+	if (onsigstack && !likely(on_sig_stack(sp)))
+		return (void __user *)-1L;
+
+	/* save i387 state */
+	if (used_math() && save_i387_xstate(*fpstate) < 0)
+		return (void __user *)-1L;
+
+	return (void __user *)sp;
 }
 
 #ifdef CONFIG_X86_32
-- 
cgit v1.2.3


From 04c93ce4991fce731dab346d03964504339347db Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 20 Mar 2009 21:02:55 +0100
Subject: x86: fix IO APIC resource allocation error message

Impact: fix incorrect error message

- IO APIC resource allocation error message contains one too many "be".

- Print the error message iff there are IO APICs in the system.

I've seen this error message for some time on my x86-32 laptop...

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Alan Bartlett <ajb.stxsl@googlemail.com>
LKML-Reference: <200903202100.30789.bzolnier@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/io_apic.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 42cdc78427a..d882c03604e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -4130,9 +4130,12 @@ static int __init ioapic_insert_resources(void)
 	struct resource *r = ioapic_resources;
 
 	if (!r) {
-		printk(KERN_ERR
-		       "IO APIC resources could be not be allocated.\n");
-		return -1;
+		if (nr_ioapics > 0) {
+			printk(KERN_ERR
+				"IO APIC resources couldn't be allocated.\n");
+			return -1;
+		}
+		return 0;
 	}
 
 	for (i = 0; i < nr_ioapics; i++) {
-- 
cgit v1.2.3


From 5a5737eac224f01e264477954d92ed6e69047b7a Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sat, 21 Mar 2009 13:28:39 +0530
Subject: x86: mpparse.c introduce smp_dump_mptable helper function

smp_read_mpc() and replace_intsrc_all() can use same smp_dump_mptable()

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 arch/x86/kernel/mpparse.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 290cb57f469..4216d265366 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -282,6 +282,14 @@ static void skip_entry(unsigned char **ptr, int *count, int size)
 	*count += size;
 }
 
+static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
+{
+	printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"
+		"type %x\n", *mpt);
+	print_hex_dump(KERN_ERR, "  ", DUMP_PREFIX_ADDRESS, 16,
+			1, mpc, mpc->length, 1);
+}
+
 static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
 {
 	char str[16];
@@ -340,10 +348,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
 			break;
 		default:
 			/* wrong mptable */
-			printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
-			printk(KERN_ERR "type %x\n", *mpt);
-			print_hex_dump(KERN_ERR, "  ", DUMP_PREFIX_ADDRESS, 16,
-					1, mpc, mpc->length, 1);
+			smp_dump_mptable(mpc, mpt);
 			count = mpc->length;
 			break;
 		}
@@ -910,10 +915,7 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,
 			break;
 		default:
 			/* wrong mptable */
-			printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
-			printk(KERN_ERR "type %x\n", *mpt);
-			print_hex_dump(KERN_ERR, "  ", DUMP_PREFIX_ADDRESS, 16,
-					1, mpc, mpc->length, 1);
+			smp_dump_mptable(mpc, mpt);
 			goto out;
 		}
 	}
-- 
cgit v1.2.3


From 0b3ba0c3ccc7ced2a06fed405e80c8e1c77a3ee7 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sat, 21 Mar 2009 13:43:20 +0530
Subject: x86: mpparse.c introduce check_physptr helper function

To reduce the size of the oversized function __get_smp_config()

There should be no impact to functionality.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 arch/x86/kernel/mpparse.c | 94 +++++++++++++++++++++++++----------------------
 1 file changed, 50 insertions(+), 44 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 4216d265366..dce99dca6cf 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -555,6 +555,55 @@ static unsigned long __init get_mpc_size(unsigned long physptr)
 	return size;
 }
 
+static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
+{
+	struct mpc_table *mpc;
+	unsigned long size;
+
+	size = get_mpc_size(mpf->physptr);
+	mpc = early_ioremap(mpf->physptr, size);
+	/*
+	 * Read the physical hardware table.  Anything here will
+	 * override the defaults.
+	 */
+	if (!smp_read_mpc(mpc, early)) {
+#ifdef CONFIG_X86_LOCAL_APIC
+		smp_found_config = 0;
+#endif
+		printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"
+			"... disabling SMP support. (tell your hw vendor)\n");
+		early_iounmap(mpc, size);
+		return -1;
+	}
+	early_iounmap(mpc, size);
+
+	if (early)
+		return -1;
+
+#ifdef CONFIG_X86_IO_APIC
+	/*
+	 * If there are no explicit MP IRQ entries, then we are
+	 * broken.  We set up most of the low 16 IO-APIC pins to
+	 * ISA defaults and hope it will work.
+	 */
+	if (!mp_irq_entries) {
+		struct mpc_bus bus;
+
+		printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
+		       "using default mptable. (tell your hw vendor)\n");
+
+		bus.type = MP_BUS;
+		bus.busid = 0;
+		memcpy(bus.bustype, "ISA   ", 6);
+		MP_bus_info(&bus);
+
+		construct_default_ioirq_mptable(0);
+	}
+#endif
+
+	return 0;
+}
+
 /*
  * Scan the memory blocks for an SMP configuration block.
  */
@@ -608,51 +657,8 @@ static void __init __get_smp_config(unsigned int early)
 		construct_default_ISA_mptable(mpf->feature1);
 
 	} else if (mpf->physptr) {
-		struct mpc_table *mpc;
-		unsigned long size;
-
-		size = get_mpc_size(mpf->physptr);
-		mpc = early_ioremap(mpf->physptr, size);
-		/*
-		 * Read the physical hardware table.  Anything here will
-		 * override the defaults.
-		 */
-		if (!smp_read_mpc(mpc, early)) {
-#ifdef CONFIG_X86_LOCAL_APIC
-			smp_found_config = 0;
-#endif
-			printk(KERN_ERR
-			       "BIOS bug, MP table errors detected!...\n");
-			printk(KERN_ERR "... disabling SMP support. "
-			       "(tell your hw vendor)\n");
-			early_iounmap(mpc, size);
-			return;
-		}
-		early_iounmap(mpc, size);
-
-		if (early)
+		if (check_physptr(mpf, early))
 			return;
-#ifdef CONFIG_X86_IO_APIC
-		/*
-		 * If there are no explicit MP IRQ entries, then we are
-		 * broken.  We set up most of the low 16 IO-APIC pins to
-		 * ISA defaults and hope it will work.
-		 */
-		if (!mp_irq_entries) {
-			struct mpc_bus bus;
-
-			printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
-			       "using default mptable. "
-			       "(tell your hw vendor)\n");
-
-			bus.type = MP_BUS;
-			bus.busid = 0;
-			memcpy(bus.bustype, "ISA   ", 6);
-			MP_bus_info(&bus);
-
-			construct_default_ioirq_mptable(0);
-		}
-#endif
 	} else
 		BUG();
 
-- 
cgit v1.2.3


From 271eb5c588e5d0a41eca6e118b6927af3f0f04b9 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sat, 21 Mar 2009 16:55:24 +0530
Subject: x86: topology.c cleanup

Impact: cleanup

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 arch/x86/kernel/topology.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 0fcc95a354f..7e4515957a1 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -25,10 +25,10 @@
  *
  * Send feedback to <colpatch@us.ibm.com>
  */
-#include <linux/init.h>
-#include <linux/smp.h>
 #include <linux/nodemask.h>
 #include <linux/mmzone.h>
+#include <linux/init.h>
+#include <linux/smp.h>
 #include <asm/cpu.h>
 
 static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
@@ -47,6 +47,7 @@ int __ref arch_register_cpu(int num)
 	 */
 	if (num)
 		per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
+
 	return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
 }
 EXPORT_SYMBOL(arch_register_cpu);
@@ -56,12 +57,13 @@ void arch_unregister_cpu(int num)
 	unregister_cpu(&per_cpu(cpu_devices, num).cpu);
 }
 EXPORT_SYMBOL(arch_unregister_cpu);
-#else
+#else /* CONFIG_HOTPLUG_CPU */
+
 static int __init arch_register_cpu(int num)
 {
 	return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
 }
-#endif /*CONFIG_HOTPLUG_CPU*/
+#endif /* CONFIG_HOTPLUG_CPU */
 
 static int __init topology_init(void)
 {
@@ -70,11 +72,11 @@ static int __init topology_init(void)
 #ifdef CONFIG_NUMA
 	for_each_online_node(i)
 		register_one_node(i);
-#endif /* CONFIG_NUMA */
+#endif
 
 	for_each_present_cpu(i)
 		arch_register_cpu(i);
+
 	return 0;
 }
-
 subsys_initcall(topology_init);
-- 
cgit v1.2.3


From 390cd85c8ae4dc54ffba460a0e6575889170f56e Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sat, 21 Mar 2009 16:55:45 +0530
Subject: x86: kdebugfs.c cleanup

Impact: cleanup

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 arch/x86/kernel/kdebugfs.c | 82 ++++++++++++++++++++--------------------------
 1 file changed, 36 insertions(+), 46 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index ff7d3b0124f..e444357375c 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -8,11 +8,11 @@
  */
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
-#include <linux/stat.h>
+#include <linux/module.h>
 #include <linux/init.h>
+#include <linux/stat.h>
 #include <linux/io.h>
 #include <linux/mm.h>
-#include <linux/module.h>
 
 #include <asm/setup.h>
 
@@ -26,9 +26,8 @@ struct setup_data_node {
 	u32 len;
 };
 
-static ssize_t
-setup_data_read(struct file *file, char __user *user_buf, size_t count,
-		loff_t *ppos)
+static ssize_t setup_data_read(struct file *file, char __user *user_buf,
+			       size_t count, loff_t *ppos)
 {
 	struct setup_data_node *node = file->private_data;
 	unsigned long remain;
@@ -39,20 +38,21 @@ setup_data_read(struct file *file, char __user *user_buf, size_t count,
 
 	if (pos < 0)
 		return -EINVAL;
+
 	if (pos >= node->len)
 		return 0;
 
 	if (count > node->len - pos)
 		count = node->len - pos;
+
 	pa = node->paddr + sizeof(struct setup_data) + pos;
 	pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
 	if (PageHighMem(pg)) {
 		p = ioremap_cache(pa, count);
 		if (!p)
 			return -ENXIO;
-	} else {
+	} else
 		p = __va(pa);
-	}
 
 	remain = copy_to_user(user_buf, p, count);
 
@@ -70,12 +70,13 @@ setup_data_read(struct file *file, char __user *user_buf, size_t count,
 static int setup_data_open(struct inode *inode, struct file *file)
 {
 	file->private_data = inode->i_private;
+
 	return 0;
 }
 
 static const struct file_operations fops_setup_data = {
-	.read =		setup_data_read,
-	.open =		setup_data_open,
+	.read		= setup_data_read,
+	.open		= setup_data_open,
 };
 
 static int __init
@@ -84,57 +85,50 @@ create_setup_data_node(struct dentry *parent, int no,
 {
 	struct dentry *d, *type, *data;
 	char buf[16];
-	int error;
 
 	sprintf(buf, "%d", no);
 	d = debugfs_create_dir(buf, parent);
-	if (!d) {
-		error = -ENOMEM;
-		goto err_return;
-	}
+	if (!d)
+		return -ENOMEM;
+
 	type = debugfs_create_x32("type", S_IRUGO, d, &node->type);
-	if (!type) {
-		error = -ENOMEM;
+	if (!type)
 		goto err_dir;
-	}
+
 	data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data);
-	if (!data) {
-		error = -ENOMEM;
+	if (!data)
 		goto err_type;
-	}
+
 	return 0;
 
 err_type:
 	debugfs_remove(type);
 err_dir:
 	debugfs_remove(d);
-err_return:
-	return error;
+	return -ENOMEM;
 }
 
 static int __init create_setup_data_nodes(struct dentry *parent)
 {
 	struct setup_data_node *node;
 	struct setup_data *data;
-	int error, no = 0;
+	int error = -ENOMEM;
 	struct dentry *d;
 	struct page *pg;
 	u64 pa_data;
+	int no = 0;
 
 	d = debugfs_create_dir("setup_data", parent);
-	if (!d) {
-		error = -ENOMEM;
-		goto err_return;
-	}
+	if (!d)
+		return -ENOMEM;
 
 	pa_data = boot_params.hdr.setup_data;
 
 	while (pa_data) {
 		node = kmalloc(sizeof(*node), GFP_KERNEL);
-		if (!node) {
-			error = -ENOMEM;
+		if (!node)
 			goto err_dir;
-		}
+
 		pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
 		if (PageHighMem(pg)) {
 			data = ioremap_cache(pa_data, sizeof(*data));
@@ -143,9 +137,8 @@ static int __init create_setup_data_nodes(struct dentry *parent)
 				error = -ENXIO;
 				goto err_dir;
 			}
-		} else {
+		} else
 			data = __va(pa_data);
-		}
 
 		node->paddr = pa_data;
 		node->type = data->type;
@@ -159,11 +152,11 @@ static int __init create_setup_data_nodes(struct dentry *parent)
 			goto err_dir;
 		no++;
 	}
+
 	return 0;
 
 err_dir:
 	debugfs_remove(d);
-err_return:
 	return error;
 }
 
@@ -175,28 +168,26 @@ static struct debugfs_blob_wrapper boot_params_blob = {
 static int __init boot_params_kdebugfs_init(void)
 {
 	struct dentry *dbp, *version, *data;
-	int error;
+	int error = -ENOMEM;
 
 	dbp = debugfs_create_dir("boot_params", NULL);
-	if (!dbp) {
-		error = -ENOMEM;
-		goto err_return;
-	}
+	if (!dbp)
+		return -ENOMEM;
+
 	version = debugfs_create_x16("version", S_IRUGO, dbp,
 				     &boot_params.hdr.version);
-	if (!version) {
-		error = -ENOMEM;
+	if (!version)
 		goto err_dir;
-	}
+
 	data = debugfs_create_blob("data", S_IRUGO, dbp,
 				   &boot_params_blob);
-	if (!data) {
-		error = -ENOMEM;
+	if (!data)
 		goto err_version;
-	}
+
 	error = create_setup_data_nodes(dbp);
 	if (error)
 		goto err_data;
+
 	return 0;
 
 err_data:
@@ -205,10 +196,9 @@ err_version:
 	debugfs_remove(version);
 err_dir:
 	debugfs_remove(dbp);
-err_return:
 	return error;
 }
-#endif
+#endif /* CONFIG_DEBUG_BOOT_PARAMS */
 
 static int __init arch_kdebugfs_init(void)
 {
-- 
cgit v1.2.3


From c8344bc218ce7ebc728337839698566a79edfe5a Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sat, 21 Mar 2009 16:56:10 +0530
Subject: x86: i8253 cleanup

Impact: cleanup

 - fix various style problems
  - fix header file issues

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 arch/x86/kernel/i8253.c | 68 ++++++++++++++++++++++++-------------------------
 1 file changed, 33 insertions(+), 35 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 10f92fb532f..3475440baa5 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -3,17 +3,17 @@
  *
  */
 #include <linux/clockchips.h>
-#include <linux/init.h>
 #include <linux/interrupt.h>
+#include <linux/spinlock.h>
 #include <linux/jiffies.h>
 #include <linux/module.h>
-#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/io.h>
 
-#include <asm/smp.h>
-#include <asm/delay.h>
 #include <asm/i8253.h>
-#include <asm/io.h>
 #include <asm/hpet.h>
+#include <asm/smp.h>
 
 DEFINE_SPINLOCK(i8253_lock);
 EXPORT_SYMBOL(i8253_lock);
@@ -40,7 +40,7 @@ static void init_pit_timer(enum clock_event_mode mode,
 {
 	spin_lock(&i8253_lock);
 
-	switch(mode) {
+	switch (mode) {
 	case CLOCK_EVT_MODE_PERIODIC:
 		/* binary, mode 2, LSB/MSB, ch 0 */
 		outb_pit(0x34, PIT_MODE);
@@ -95,7 +95,7 @@ static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
  * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
  * !using_apic_timer decisions in do_timer_interrupt_hook()
  */
-static struct clock_event_device pit_clockevent = {
+static struct clock_event_device pit_ce = {
 	.name		= "pit",
 	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
 	.set_mode	= init_pit_timer,
@@ -114,15 +114,13 @@ void __init setup_pit_timer(void)
 	 * Start pit with the boot cpu mask and make it global after the
 	 * IO_APIC has been initialized.
 	 */
-	pit_clockevent.cpumask = cpumask_of(smp_processor_id());
-	pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC,
-				     pit_clockevent.shift);
-	pit_clockevent.max_delta_ns =
-		clockevent_delta2ns(0x7FFF, &pit_clockevent);
-	pit_clockevent.min_delta_ns =
-		clockevent_delta2ns(0xF, &pit_clockevent);
-	clockevents_register_device(&pit_clockevent);
-	global_clock_event = &pit_clockevent;
+	pit_ce.cpumask = cpumask_of(smp_processor_id());
+	pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift);
+	pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce);
+	pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce);
+
+	clockevents_register_device(&pit_ce);
+	global_clock_event = &pit_ce;
 }
 
 #ifndef CONFIG_X86_64
@@ -133,11 +131,11 @@ void __init setup_pit_timer(void)
  */
 static cycle_t pit_read(void)
 {
+	static int old_count;
+	static u32 old_jifs;
 	unsigned long flags;
 	int count;
 	u32 jifs;
-	static int old_count;
-	static u32 old_jifs;
 
 	spin_lock_irqsave(&i8253_lock, flags);
 	/*
@@ -179,9 +177,9 @@ static cycle_t pit_read(void)
 	 * Previous attempts to handle these cases intelligently were
 	 * buggy, so we just do the simple thing now.
 	 */
-	if (count > old_count && jifs == old_jifs) {
+	if (count > old_count && jifs == old_jifs)
 		count = old_count;
-	}
+
 	old_count = count;
 	old_jifs = jifs;
 
@@ -192,13 +190,13 @@ static cycle_t pit_read(void)
 	return (cycle_t)(jifs * LATCH) + count;
 }
 
-static struct clocksource clocksource_pit = {
-	.name	= "pit",
-	.rating = 110,
-	.read	= pit_read,
-	.mask	= CLOCKSOURCE_MASK(32),
-	.mult	= 0,
-	.shift	= 20,
+static struct clocksource pit_cs = {
+	.name		= "pit",
+	.rating		= 110,
+	.read		= pit_read,
+	.mask		= CLOCKSOURCE_MASK(32),
+	.mult		= 0,
+	.shift		= 20,
 };
 
 static void pit_disable_clocksource(void)
@@ -206,9 +204,9 @@ static void pit_disable_clocksource(void)
 	/*
 	 * Use mult to check whether it is registered or not
 	 */
-	if (clocksource_pit.mult) {
-		clocksource_unregister(&clocksource_pit);
-		clocksource_pit.mult = 0;
+	if (pit_cs.mult) {
+		clocksource_unregister(&pit_cs);
+		pit_cs.mult = 0;
 	}
 }
 
@@ -222,13 +220,13 @@ static int __init init_pit_clocksource(void)
 	  * - when local APIC timer is active (PIT is switched off)
 	  */
 	if (num_possible_cpus() > 1 || is_hpet_enabled() ||
-	    pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
+	    pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
 		return 0;
 
-	clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE,
-						   clocksource_pit.shift);
-	return clocksource_register(&clocksource_pit);
+	pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift);
+
+	return clocksource_register(&pit_cs);
 }
 arch_initcall(init_pit_clocksource);
 
-#endif
+#endif /* !CONFIG_X86_64 */
-- 
cgit v1.2.3


From 8383d821e7481f7cde9c17b61deb1b4af43b2cd9 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sat, 21 Mar 2009 16:56:37 +0530
Subject: x86: rtc.c cleanup

Impact: cleanup

 - fix various style problems
  - fix header file issues

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 arch/x86/kernel/rtc.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index dd6f2b71561..5d465b207e7 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -1,14 +1,14 @@
 /*
  * RTC related functions
  */
+#include <linux/platform_device.h>
+#include <linux/mc146818rtc.h>
 #include <linux/acpi.h>
 #include <linux/bcd.h>
-#include <linux/mc146818rtc.h>
-#include <linux/platform_device.h>
 #include <linux/pnp.h>
 
-#include <asm/time.h>
 #include <asm/vsyscall.h>
+#include <asm/time.h>
 
 #ifdef CONFIG_X86_32
 /*
@@ -16,9 +16,9 @@
  * register we are working with.  It is required for NMI access to the
  * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
  */
-volatile unsigned long cmos_lock = 0;
+volatile unsigned long cmos_lock;
 EXPORT_SYMBOL(cmos_lock);
-#endif
+#endif /* CONFIG_X86_32 */
 
 /* For two digit years assume time is always after that */
 #define CMOS_YEARS_OFFS 2000
@@ -38,9 +38,9 @@ EXPORT_SYMBOL(rtc_lock);
  */
 int mach_set_rtc_mmss(unsigned long nowtime)
 {
-	int retval = 0;
 	int real_seconds, real_minutes, cmos_minutes;
 	unsigned char save_control, save_freq_select;
+	int retval = 0;
 
 	 /* tell the clock it's being set */
 	save_control = CMOS_READ(RTC_CONTROL);
@@ -72,8 +72,8 @@ int mach_set_rtc_mmss(unsigned long nowtime)
 			real_seconds = bin2bcd(real_seconds);
 			real_minutes = bin2bcd(real_minutes);
 		}
-		CMOS_WRITE(real_seconds,RTC_SECONDS);
-		CMOS_WRITE(real_minutes,RTC_MINUTES);
+		CMOS_WRITE(real_seconds, RTC_SECONDS);
+		CMOS_WRITE(real_minutes, RTC_MINUTES);
 	} else {
 		printk(KERN_WARNING
 		       "set_rtc_mmss: can't update from %d to %d\n",
@@ -151,6 +151,7 @@ unsigned char rtc_cmos_read(unsigned char addr)
 	outb(addr, RTC_PORT(0));
 	val = inb(RTC_PORT(1));
 	lock_cmos_suffix(addr);
+
 	return val;
 }
 EXPORT_SYMBOL(rtc_cmos_read);
@@ -166,8 +167,8 @@ EXPORT_SYMBOL(rtc_cmos_write);
 
 static int set_rtc_mmss(unsigned long nowtime)
 {
-	int retval;
 	unsigned long flags;
+	int retval;
 
 	spin_lock_irqsave(&rtc_lock, flags);
 	retval = set_wallclock(nowtime);
@@ -242,6 +243,7 @@ static __init int add_rtc_cmos(void)
 	platform_device_register(&rtc_device);
 	dev_info(&rtc_device.dev,
 		 "registered platform RTC device (no PNP device found)\n");
+
 	return 0;
 }
 device_initcall(add_rtc_cmos);
-- 
cgit v1.2.3


From d53a44446076a7dd68a4fb7f549fb6aeda9bfc2c Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sat, 21 Mar 2009 16:57:04 +0530
Subject: x86: io_delay.c cleanup

Impact: cleanup

 - fix header file issues

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 arch/x86/kernel/io_delay.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 720d2607aac..a979b5bd2fc 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -7,10 +7,10 @@
  */
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/init.h>
 #include <linux/delay.h>
+#include <linux/init.h>
 #include <linux/dmi.h>
-#include <asm/io.h>
+#include <linux/io.h>
 
 int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE;
 
@@ -47,8 +47,7 @@ EXPORT_SYMBOL(native_io_delay);
 static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id)
 {
 	if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) {
-		printk(KERN_NOTICE "%s: using 0xed I/O delay port\n",
-			id->ident);
+		pr_notice("%s: using 0xed I/O delay port\n", id->ident);
 		io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
 	}
 
@@ -64,40 +63,40 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
 		.callback	= dmi_io_delay_0xed_port,
 		.ident		= "Compaq Presario V6000",
 		.matches	= {
-			DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
-			DMI_MATCH(DMI_BOARD_NAME, "30B7")
+			DMI_MATCH(DMI_BOARD_VENDOR,	"Quanta"),
+			DMI_MATCH(DMI_BOARD_NAME,	"30B7")
 		}
 	},
 	{
 		.callback	= dmi_io_delay_0xed_port,
 		.ident		= "HP Pavilion dv9000z",
 		.matches	= {
-			DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
-			DMI_MATCH(DMI_BOARD_NAME, "30B9")
+			DMI_MATCH(DMI_BOARD_VENDOR,	"Quanta"),
+			DMI_MATCH(DMI_BOARD_NAME,	"30B9")
 		}
 	},
 	{
 		.callback	= dmi_io_delay_0xed_port,
 		.ident		= "HP Pavilion dv6000",
 		.matches	= {
-			DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
-			DMI_MATCH(DMI_BOARD_NAME, "30B8")
+			DMI_MATCH(DMI_BOARD_VENDOR,	"Quanta"),
+			DMI_MATCH(DMI_BOARD_NAME,	"30B8")
 		}
 	},
 	{
 		.callback	= dmi_io_delay_0xed_port,
 		.ident		= "HP Pavilion tx1000",
 		.matches	= {
-			DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
-			DMI_MATCH(DMI_BOARD_NAME, "30BF")
+			DMI_MATCH(DMI_BOARD_VENDOR,	"Quanta"),
+			DMI_MATCH(DMI_BOARD_NAME,	"30BF")
 		}
 	},
 	{
 		.callback	= dmi_io_delay_0xed_port,
 		.ident		= "Presario F700",
 		.matches	= {
-			DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
-			DMI_MATCH(DMI_BOARD_NAME, "30D3")
+			DMI_MATCH(DMI_BOARD_VENDOR,	"Quanta"),
+			DMI_MATCH(DMI_BOARD_NAME,	"30D3")
 		}
 	},
 	{ }
-- 
cgit v1.2.3


From 1894e36754d682cc049b2b1c3825da8e585967d5 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sat, 21 Mar 2009 17:01:25 +0530
Subject: x86: pci-nommu.c cleanup

Impact: cleanup

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 arch/x86/kernel/pci-nommu.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index c70ab5a5d4c..8b02a3936d4 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -1,14 +1,14 @@
 /* Fallback functions when the main IOMMU code is not compiled in. This
    code is roughly equivalent to i386. */
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/string.h>
 #include <linux/dma-mapping.h>
 #include <linux/scatterlist.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/mm.h>
 
-#include <asm/iommu.h>
 #include <asm/processor.h>
+#include <asm/iommu.h>
 #include <asm/dma.h>
 
 static int
@@ -79,11 +79,11 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
 }
 
 struct dma_mapping_ops nommu_dma_ops = {
-	.alloc_coherent = dma_generic_alloc_coherent,
-	.free_coherent = nommu_free_coherent,
-	.map_single = nommu_map_single,
-	.map_sg = nommu_map_sg,
-	.is_phys = 1,
+	.alloc_coherent	= dma_generic_alloc_coherent,
+	.free_coherent	= nommu_free_coherent,
+	.map_single	= nommu_map_single,
+	.map_sg		= nommu_map_sg,
+	.is_phys	= 1,
 };
 
 void __init no_iommu_init(void)
-- 
cgit v1.2.3


From 1cc185211a9cb913f6adbe3354e5c256f456ebd2 Mon Sep 17 00:00:00 2001
From: Dmitri Vorobiev <dmitri.vorobiev@movial.com>
Date: Sun, 22 Mar 2009 19:11:09 +0200
Subject: x86: Fix a couple of sparse warnings in
 arch/x86/kernel/apic/io_apic.c

Impact: cleanup

This patch fixes the following sparse warnings:

 arch/x86/kernel/apic/io_apic.c:3602:17: warning: symbol 'hpet_msi_type'
 was not declared. Should it be static?

 arch/x86/kernel/apic/io_apic.c:3467:30: warning: Using plain integer as
 NULL pointer

Signed-off-by: Dmitri Vorobiev <dmitri.vorobiev@movial.com>
LKML-Reference: <1237741871-5827-2-git-send-email-dmitri.vorobiev@movial.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/io_apic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 42cdc78427a..ea97e5efa90 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3464,7 +3464,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	int ret, sub_handle;
 	struct msi_desc *msidesc;
 	unsigned int irq_want;
-	struct intel_iommu *iommu = 0;
+	struct intel_iommu *iommu = NULL;
 	int index = 0;
 
 	irq_want = nr_irqs_gsi;
@@ -3599,7 +3599,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 
 #endif /* CONFIG_SMP */
 
-struct irq_chip hpet_msi_type = {
+static struct irq_chip hpet_msi_type = {
 	.name = "HPET_MSI",
 	.unmask = hpet_msi_unmask,
 	.mask = hpet_msi_mask,
-- 
cgit v1.2.3


From f2362e6f1b9c5c168e5b4159afb4853ba467965e Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Mon, 23 Mar 2009 02:06:51 +0530
Subject: x86: cpu/cpu.h cleanup

Impact: cleanup

 - Fix various style issues

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 arch/x86/kernel/cpu/cpu.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 9469ecb5aeb..6de9a908e40 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -3,25 +3,25 @@
 #define ARCH_X86_CPU_H
 
 struct cpu_model_info {
-	int vendor;
-	int family;
-	const char *model_names[16];
+	int		vendor;
+	int		family;
+	const char	*model_names[16];
 };
 
 /* attempt to consolidate cpu attributes */
 struct cpu_dev {
-	const char	* c_vendor;
+	const char	*c_vendor;
 
 	/* some have two possibilities for cpuid string */
-	const char	* c_ident[2];
+	const char	*c_ident[2];
 
 	struct		cpu_model_info c_models[4];
 
-	void            (*c_early_init)(struct cpuinfo_x86 *c);
-	void		(*c_init)(struct cpuinfo_x86 * c);
-	void		(*c_identify)(struct cpuinfo_x86 * c);
-	unsigned int	(*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size);
-	int	c_x86_vendor;
+	void            (*c_early_init)(struct cpuinfo_x86 *);
+	void		(*c_init)(struct cpuinfo_x86 *);
+	void		(*c_identify)(struct cpuinfo_x86 *);
+	unsigned int	(*c_size_cache)(struct cpuinfo_x86 *, unsigned int);
+	int		c_x86_vendor;
 };
 
 #define cpu_dev_register(cpu_devX) \
-- 
cgit v1.2.3


From ce2d8bfd44c01fc9b22d64617b7e520e99095f33 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Mon, 23 Mar 2009 02:08:00 +0530
Subject: x86: irq.c use same path for show_interrupts

Impact: cleanup

SMP and !SMP will use same path for show_interrupts

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 arch/x86/kernel/irq.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index b8ac3b6cf77..5e7c3e6f8f2 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -133,23 +133,15 @@ int show_interrupts(struct seq_file *p, void *v)
 		return 0;
 
 	spin_lock_irqsave(&desc->lock, flags);
-#ifndef CONFIG_SMP
-	any_count = kstat_irqs(i);
-#else
 	for_each_online_cpu(j)
 		any_count |= kstat_irqs_cpu(i, j);
-#endif
 	action = desc->action;
 	if (!action && !any_count)
 		goto out;
 
 	seq_printf(p, "%*d: ", prec, i);
-#ifndef CONFIG_SMP
-	seq_printf(p, "%10u ", kstat_irqs(i));
-#else
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
-#endif
 	seq_printf(p, " %8s", desc->chip->name);
 	seq_printf(p, "-%-8s", desc->name);
 
-- 
cgit v1.2.3


From 474e56b82c06cdbed468aea957805e0eb19d3510 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Mon, 23 Mar 2009 02:08:34 +0530
Subject: x86: irq.c keep CONFIG_X86_LOCAL_APIC interrupts together

Impact: cleanup

keep CONFIG_X86_LOCAL_APIC interrupts together to avoid extra ifdef

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 arch/x86/kernel/irq.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 5e7c3e6f8f2..3aaf7b9e3a8 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -58,6 +58,11 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
 	seq_printf(p, "  Local timer interrupts\n");
+
+	seq_printf(p, "%*s: ", prec, "SPU");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
+	seq_printf(p, "  Spurious interrupts\n");
 #endif
 	if (generic_interrupt_extension) {
 		seq_printf(p, "PLT: ");
@@ -90,12 +95,6 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
 	seq_printf(p, "  Threshold APIC interrupts\n");
 # endif
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
-	seq_printf(p, "%*s: ", prec, "SPU");
-	for_each_online_cpu(j)
-		seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
-	seq_printf(p, "  Spurious interrupts\n");
 #endif
 	seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
 #if defined(CONFIG_X86_IO_APIC)
@@ -166,6 +165,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	sum += irq_stats(cpu)->apic_timer_irqs;
+	sum += irq_stats(cpu)->irq_spurious_count;
 #endif
 	if (generic_interrupt_extension)
 		sum += irq_stats(cpu)->generic_irqs;
@@ -179,9 +179,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 # ifdef CONFIG_X86_64
 	sum += irq_stats(cpu)->irq_threshold_count;
 #endif
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
-	sum += irq_stats(cpu)->irq_spurious_count;
 #endif
 	return sum;
 }
-- 
cgit v1.2.3


From a1e38ca5ce1789de1bbd723e1e09de962e47ce18 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Mon, 23 Mar 2009 02:11:25 +0530
Subject: x86: apic/io_apic.c define msi_ir_chip and ir_ioapic_chip all the
 time
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

move out msi_ir_chip and ir_ioapic_chip from CONFIG_INTR_REMAP shadow

Fix:
 arch/x86/kernel/apic/io_apic.c:1431: warning: ‘msi_ir_chip’ defined but not used

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 arch/x86/kernel/apic/io_apic.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 42cdc78427a..d36e3d8be0f 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1428,7 +1428,6 @@ void __setup_vector_irq(int cpu)
 
 static struct irq_chip ioapic_chip;
 static struct irq_chip ir_ioapic_chip;
-static struct irq_chip msi_ir_chip;
 
 #define IOAPIC_AUTO     -1
 #define IOAPIC_EDGE     0
@@ -2663,20 +2662,20 @@ static struct irq_chip ioapic_chip __read_mostly = {
 	.retrigger	= ioapic_retrigger_irq,
 };
 
-#ifdef CONFIG_INTR_REMAP
 static struct irq_chip ir_ioapic_chip __read_mostly = {
 	.name		= "IR-IO-APIC",
 	.startup	= startup_ioapic_irq,
 	.mask		= mask_IO_APIC_irq,
 	.unmask		= unmask_IO_APIC_irq,
+#ifdef CONFIG_INTR_REMAP
 	.ack		= ack_x2apic_edge,
 	.eoi		= ack_x2apic_level,
 #ifdef CONFIG_SMP
 	.set_affinity	= set_ir_ioapic_affinity_irq,
+#endif
 #endif
 	.retrigger	= ioapic_retrigger_irq,
 };
-#endif
 
 static inline void init_IO_APIC_traps(void)
 {
@@ -3391,18 +3390,18 @@ static struct irq_chip msi_chip = {
 	.retrigger	= ioapic_retrigger_irq,
 };
 
-#ifdef CONFIG_INTR_REMAP
 static struct irq_chip msi_ir_chip = {
 	.name		= "IR-PCI-MSI",
 	.unmask		= unmask_msi_irq,
 	.mask		= mask_msi_irq,
+#ifdef CONFIG_INTR_REMAP
 	.ack		= ack_x2apic_edge,
 #ifdef CONFIG_SMP
 	.set_affinity	= ir_set_msi_irq_affinity,
+#endif
 #endif
 	.retrigger	= ioapic_retrigger_irq,
 };
-#endif
 
 /*
  * Map the PCI dev to the corresponding remapping hardware unit
-- 
cgit v1.2.3


From ba639039d68cd978f4fa900a6533fe930609ed35 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Mon, 23 Mar 2009 02:13:01 +0530
Subject: x86: e820 fix various signedness issues in setup.c and e820.c

Impact: cleanup

This fixed various signedness issues in setup.c and e820.c:
arch/x86/kernel/setup.c:455:53: warning: incorrect type in argument 3 (different signedness)
arch/x86/kernel/setup.c:455:53:    expected int *pnr_map
arch/x86/kernel/setup.c:455:53:    got unsigned int extern [toplevel] *<noident>
arch/x86/kernel/setup.c:639:53: warning: incorrect type in argument 3 (different signedness)
arch/x86/kernel/setup.c:639:53:    expected int *pnr_map
arch/x86/kernel/setup.c:639:53:    got unsigned int extern [toplevel] *<noident>
arch/x86/kernel/setup.c:820:54: warning: incorrect type in argument 3 (different signedness)
arch/x86/kernel/setup.c:820:54:    expected int *pnr_map
arch/x86/kernel/setup.c:820:54:    got unsigned int extern [toplevel] *<noident>

arch/x86/kernel/e820.c:670:53: warning: incorrect type in argument 3 (different signedness)
arch/x86/kernel/e820.c:670:53:    expected int *pnr_map
arch/x86/kernel/e820.c:670:53:    got unsigned int [toplevel] *<noident>

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 arch/x86/kernel/e820.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index fb638d9ce6d..ef2c3563357 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -233,7 +233,7 @@ void __init e820_print_map(char *who)
  */
 
 int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
-				int *pnr_map)
+			     u32 *pnr_map)
 {
 	struct change_member {
 		struct e820entry *pbios; /* pointer to original bios entry */
@@ -552,7 +552,7 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
 
 void __init update_e820(void)
 {
-	int nr_map;
+	u32 nr_map;
 
 	nr_map = e820.nr_map;
 	if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
@@ -563,7 +563,7 @@ void __init update_e820(void)
 }
 static void __init update_e820_saved(void)
 {
-	int nr_map;
+	u32 nr_map;
 
 	nr_map = e820_saved.nr_map;
 	if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
@@ -1303,7 +1303,7 @@ early_param("memmap", parse_memmap_opt);
 void __init finish_e820_parsing(void)
 {
 	if (userdef) {
-		int nr = e820.nr_map;
+		u32 nr = e820.nr_map;
 
 		if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
 			early_panic("Invalid user supplied memory map");
@@ -1386,7 +1386,7 @@ void __init e820_reserve_resources_late(void)
 char *__init default_machine_specific_memory_setup(void)
 {
 	char *who = "BIOS-e820";
-	int new_nr;
+	u32 new_nr;
 	/*
 	 * Try to copy the BIOS-supplied E820-map.
 	 *
-- 
cgit v1.2.3


From fa74c9073370e57fa28e02aff13f4d7b1806505c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Tue, 24 Mar 2009 13:23:16 -0700
Subject: x86: fix set_extra_move_desc calling

Impact: fix bug with irq-descriptor moving when logical flat

Rusty observed:

> The effect of setting desc->affinity (ie. from userspace via sysfs) has varied
> over time.  In 2.6.27, the 32-bit code anded the value with cpu_online_map,
> and both 32 and 64-bit did that anding whenever a cpu was unplugged.
>
> 2.6.29 consolidated this into one routine (and fixed hotplug) but introduced
> another variation: anding the affinity with cfg->domain.  Is this right, or
> should we just set it to what the user said?  Or as now, indicate that we're
> restricting it.

Eric pointed out that desc->affinity should be what the user requested,
if it is at all possible to honor the user space request.

This bug got introduced by commit 22f65d31b "x86: Update io_apic.c to use
new cpumask API".

Fix it by moving the masking to before the descriptor moving ...

Reported-by: Rusty Russell <rusty@rustcorp.com.au>
Reported-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <49C94134.4000408@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/io_apic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 86827d85488..1ed6c0600cd 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -592,8 +592,9 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
 	if (assign_irq_vector(irq, cfg, mask))
 		return BAD_APICID;
 
-	cpumask_and(desc->affinity, cfg->domain, mask);
+	/* check that before desc->addinity get updated */
 	set_extra_move_desc(desc, mask);
+	cpumask_and(desc->affinity, cfg->domain, mask);
 
 	return apic->cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask);
 }
-- 
cgit v1.2.3


From f56e5034121c4911a155ba907076ab920754626d Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Tue, 24 Mar 2009 14:16:30 -0700
Subject: x86: use default_cpu_mask_to_apicid for 64bit

Impact: cleanup

Use online_mask directly on 64bit too.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <49C94DAE.9070300@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/apic_flat_64.c | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index f933822dba1..0014714ea97 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -159,20 +159,6 @@ static int flat_apic_id_registered(void)
 	return physid_isset(read_xapic_id(), phys_cpu_present_map);
 }
 
-static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
-	return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
-}
-
-static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
-						const struct cpumask *andmask)
-{
-	unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
-	unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS;
-
-	return mask1 & mask2;
-}
-
 static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
 {
 	return hard_smp_processor_id() >> index_msb;
@@ -213,8 +199,8 @@ struct apic apic_flat =  {
 	.set_apic_id			= set_apic_id,
 	.apic_id_mask			= 0xFFu << 24,
 
-	.cpu_mask_to_apicid		= flat_cpu_mask_to_apicid,
-	.cpu_mask_to_apicid_and		= flat_cpu_mask_to_apicid_and,
+	.cpu_mask_to_apicid		= default_cpu_mask_to_apicid,
+	.cpu_mask_to_apicid_and		= default_cpu_mask_to_apicid_and,
 
 	.send_IPI_mask			= flat_send_IPI_mask,
 	.send_IPI_mask_allbutself	= flat_send_IPI_mask_allbutself,
-- 
cgit v1.2.3


From e06b1b56f9bfcc91e1f175fe8d8bf3e35dafa080 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 24 Mar 2009 14:17:19 -0700
Subject: x86: Correct behaviour of irq affinity

Impact: get correct smp_affinity as user requested

The effect of setting desc->affinity (ie. from userspace via sysfs) has
varied over time.  In 2.6.27, the 32-bit code anded the value with
cpu_online_map, and both 32 and 64-bit did that anding whenever a cpu
was unplugged.

2.6.29 consolidated this into one routine (and fixed hotplug) but
introduced another variation: anding the affinity with cfg->domain.

We should just set it to what the user said - if possible.

(cpu_mask_to_apicid_and already takes cpu_online_mask into account)

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <49C94DDF.2010703@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/io_apic.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 1ed6c0600cd..d990408ca06 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -594,9 +594,10 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
 
 	/* check that before desc->addinity get updated */
 	set_extra_move_desc(desc, mask);
-	cpumask_and(desc->affinity, cfg->domain, mask);
 
-	return apic->cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask);
+	cpumask_copy(desc->affinity, mask);
+
+	return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
 }
 
 static void
-- 
cgit v1.2.3


From 70511134f61bd6e5eed19f767381f9fb3e762d49 Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Mon, 23 Mar 2009 23:14:29 -0700
Subject: Revert "x86: don't compile vsmp_64 for 32bit"

Partial revert of commit 129d8bc828e011bda0b7110a097bf3a0167f966e
titled 'x86: don't compile vsmp_64 for 32bit'

Commit reverted to compile vsmp_64.c if CONFIG_X86_64 is defined,
since is_vsmp_box() needs to indicate that TSCs are not synchronized, and
hence, not a valid time source, even when CONFIG_X86_VSMP is not defined.

Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: shai@scalex86.org
LKML-Reference: <20090324061429.GH7278@localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/Makefile  |  2 +-
 arch/x86/kernel/vsmp_64.c | 12 +++++++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 339ce35648e..6e9c1f320ac 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -70,7 +70,6 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o
-obj-$(CONFIG_X86_VSMP)		+= vsmp_64.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_MODULES)		+= module_$(BITS).o
 obj-$(CONFIG_EFI) 		+= efi.o efi_$(BITS).o efi_stub_$(BITS).o
@@ -120,4 +119,5 @@ ifeq ($(CONFIG_X86_64),y)
 	obj-$(CONFIG_AMD_IOMMU)		+= amd_iommu_init.o amd_iommu.o
 
 	obj-$(CONFIG_PCI_MMCONFIG)	+= mmconf-fam10h_64.o
+	obj-y				+= vsmp_64.o
 endif
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 74de562812c..a1d804bcd48 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -22,7 +22,7 @@
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 
-#ifdef CONFIG_PARAVIRT
+#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
 /*
  * Interrupt control on vSMPowered systems:
  * ~AC is a shadow of IF.  If IF is 'on' AC should be 'off'
@@ -114,6 +114,7 @@ static void __init set_vsmp_pv_ops(void)
 }
 #endif
 
+#ifdef CONFIG_PCI
 static int is_vsmp = -1;
 
 static void __init detect_vsmp_box(void)
@@ -139,6 +140,15 @@ int is_vsmp_box(void)
 	}
 }
 
+#else
+static void __init detect_vsmp_box(void)
+{
+}
+int is_vsmp_box(void)
+{
+	return 0;
+}
+#endif
 void __init vsmp_init(void)
 {
 	detect_vsmp_box();
-- 
cgit v1.2.3