From f52111b1546943545e67573c4dde1c7613ca33d3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 May 2008 18:19:16 -0400 Subject: [PATCH] take init_files to fs/file.c Signed-off-by: Al Viro --- arch/x86/kernel/init_task.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 3d01e47777d..a4f93b4120c 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -11,7 +11,6 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); -- cgit v1.2.3 From 107d6d2efa9eb8c48d050936d8019230ac6b24cd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 5 May 2008 14:58:26 +0300 Subject: KVM: x86 emulator: fix writes to registers with modrm encodings A register destination encoded with a mod=3 encoding left dst.ptr NULL. Normally we don't trap writes to registers, but in the case of smsw, we do. Fix by pointing dst.ptr at the destination register. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index f2a696d6a24..8a96320ab07 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -677,8 +677,9 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, c->use_modrm_ea = 1; if (c->modrm_mod == 3) { - c->modrm_val = *(unsigned long *) - decode_register(c->modrm_rm, c->regs, c->d & ByteOp); + c->modrm_ptr = decode_register(c->modrm_rm, + c->regs, c->d & ByteOp); + c->modrm_val = *(unsigned long *)c->modrm_ptr; return rc; } @@ -1005,6 +1006,7 @@ done_prefixes: if ((c->d & ModRM) && c->modrm_mod == 3) { c->src.type = OP_REG; c->src.val = c->modrm_val; + c->src.ptr = c->modrm_ptr; break; } c->src.type = OP_MEM; @@ -1049,6 +1051,7 @@ done_prefixes: if ((c->d & ModRM) && c->modrm_mod == 3) { c->dst.type = OP_REG; c->dst.val = c->dst.orig_val = c->modrm_val; + c->dst.ptr = c->modrm_ptr; break; } c->dst.type = OP_MEM; -- cgit v1.2.3 From eedaa4e2af681a266c084c410238855bdfbc2787 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 6 May 2008 13:32:54 -0300 Subject: KVM: PIT: take inject_pending into account when emulating hlt Otherwise hlt emulation fails if PIT is not injecting IRQ's. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 3324d90038e..7c077a9d977 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -216,7 +216,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu) { struct kvm_pit *pit = vcpu->kvm->arch.vpit; - if (pit && vcpu->vcpu_id == 0) + if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending) return atomic_read(&pit->pit_state.pit_timer.pending); return 0; -- cgit v1.2.3 From 54aaacee35afd594bba3244c20b02cc98d80a961 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 14 May 2008 02:29:06 -0300 Subject: KVM: LAPIC: ignore pending timers if LVTT is disabled Only use the APIC pending timers count to break out of HLT emulation if the timer vector is enabled. Certain configurations of Windows simply mask out the vector without disabling the timer. 
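As an illustrative aside (not part of the patch): the apic_lvt_enabled() test added here boils down to checking the mask bit of the APIC LVT timer entry, which is bit 16 of the register. A minimal userspace C sketch of just that test, with hypothetical names and values:

#include <stdio.h>
#include <stdint.h>

#define APIC_LVT_MASKED (1u << 16)	/* mask bit of an APIC LVT entry */

static int lvt_enabled(uint32_t lvt_reg)
{
	return !(lvt_reg & APIC_LVT_MASKED);
}

int main(void)
{
	uint32_t masked_lvtt = 0x10020;	/* vector 0x20, mask bit set */
	uint32_t active_lvtt = 0x00020;	/* vector 0x20, mask bit clear */

	printf("masked LVTT counts as enabled? %d\n", lvt_enabled(masked_lvtt));	/* 0 */
	printf("active LVTT counts as enabled? %d\n", lvt_enabled(active_lvtt));	/* 1 */
	return 0;
}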
Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 36809d79788..c297c50eba6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -957,7 +957,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *lapic = vcpu->arch.apic; - if (lapic) + if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT)) return atomic_read(&lapic->timer.pending); return 0; -- cgit v1.2.3
From 8d64c781f0c5fbfdf8016bd1634506ff2ad1376a Mon Sep 17 00:00:00 2001 From: Tony Camuso Date: Thu, 15 May 2008 14:40:14 -0400 Subject: PCI: Correct last two HP entries in the bfsort whitelist Replace Redundant Whitelist Entries with the Correct Ones The ProLiant DL385 G2 and the DL585 G2 are entered redundantly in the dmi_system_id table. What should have been there are the DL360 and DL380. This patch simply replaces the redundant entries with the correct entries. Signed-off-by: Tony Camuso Signed-off-by: Pat Schoeller Signed-off-by: Jesse Barnes --- arch/x86/pci/common.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 8545c8a9d10..6e64aaf00d1 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -302,18 +302,18 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = { }, { .callback = set_bf_sort, - .ident = "HP ProLiant DL385 G2", + .ident = "HP ProLiant DL360", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "HP"), - DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL385 G2"), + DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL360"), }, }, { .callback = set_bf_sort, - .ident = "HP ProLiant DL585 G2", + .ident = "HP ProLiant DL380", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "HP"), - DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL585 G2"), + DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL380"), }, }, #ifdef __i386__ -- cgit v1.2.3
From eba9fe93a2959ec7f195c47c9db6ce7b5114ce1f Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Tue, 18 Mar 2008 15:24:32 -0500 Subject: [CPUFREQ] powernow-k8: improve error messages The most common error with powernow-k8 is an ACPI _PSS error caused either by failure to load the ACPI processor module or a bad parse of the _PSS object. Make the error message returned to the user in these situations more straightforward and easier to understand. -Mark Langsdorf Operating System Research Center AMD Signed-off-by: Mark Langsdorf Signed-off-by: Andreas Herrmann Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 46d4034d9f3..206791eb46e 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1127,12 +1127,23 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) * an UP version, and is deprecated by AMD. */ if (num_online_cpus() != 1) { - printk(KERN_ERR PFX "MP systems not supported by PSB BIOS structure\n"); +#ifndef CONFIG_ACPI_PROCESSOR + printk(KERN_ERR PFX "ACPI Processor support is required " + "for SMP systems but is absent. Please load the " + "ACPI Processor module before starting this " + "driver.\n"); +#else + printk(KERN_ERR PFX "Your BIOS does not provide ACPI " + "_PSS objects in a way that Linux understands. " + "Please report this to the Linux ACPI maintainers" + " and complain to your BIOS vendor.\n"); +#endif kfree(data); return -ENODEV; } if (pol->cpu != 0) { - printk(KERN_ERR PFX "No _PSS objects for CPU other than CPU0\n"); + printk(KERN_ERR PFX "No ACPI _PSS objects for CPU other than " + "CPU0. Complain to your BIOS vendor.\n"); kfree(data); return -ENODEV; } -- cgit v1.2.3
From 667ad4f70110357e8f024e81741c7bd1d7906e7d Mon Sep 17 00:00:00 2001 From: maximilian attems Date: Thu, 8 May 2008 22:10:01 +0200 Subject: [CPUFREQ] Crusoe: longrun cpufreq module reports false min freq The longrun cpufreq module reports a false minimum frequency of 3MHz on a 300-600MHz Crusoe processor. This may be due to a calculation bug in the module. Original patch from Kaz Sasayama submitted as http://bugs.debian.org/468149 patch ported to x86 Cc: Kaz Sasayama Signed-off-by: maximilian attems Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/longrun.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index af4a867a097..777a7ff075d 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -245,7 +245,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, if ((ecx > 95) || (ecx == 0) || (eax < ebx)) return -EIO; - edx = (eax - ebx) / (100 - ecx); + edx = ((eax - ebx) * 100) / (100 - ecx); *low_freq = edx * 1000; /* back to kHz */ dprintk("low frequency is %u kHz\n", *low_freq); -- cgit v1.2.3
From a1676072558854b95336c8f7db76b0504e909a0a Mon Sep 17 00:00:00 2001 From: Tony Camuso Date: Thu, 15 May 2008 14:40:14 -0400 Subject: PCI: Correct last two HP entries in the bfsort whitelist Greetings. There is a code flaw in the bfsort whitelist, where there are redundant entries for the same two HP systems, DL385 G2 and DL585 G2. This patch replaces those redundant entries with the correct ones. The correct entries are for large-volume systems, the DL360 and DL380. ----------------------------------------------------------------------- commit ec69f0374c3b0ad7ea991b0e9ac00377acfe5b1a Author: Tony Camuso Date: Wed May 14 07:09:28 2008 -0400 Replace Redundant Whitelist Entries with the Correct Ones The ProLiant DL385 G2 and the DL585 G2 are entered redundantly in the dmi_system_id table. What should have been there are the DL360 and DL380. This patch simply replaces the redundant entries with the correct entries.
arch/x86/pci/common.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) Signed-off-by: Tony Camuso Signed-off-by: Pat Schoeller Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/pci/common.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 6e64aaf00d1..940185ecaed 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -328,18 +328,18 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = { #endif { .callback = set_bf_sort, - .ident = "HP ProLiant DL385 G2", + .ident = "HP ProLiant DL360", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "HP"), - DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL385 G2"), + DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL360"), }, }, { .callback = set_bf_sort, - .ident = "HP ProLiant DL585 G2", + .ident = "HP ProLiant DL380", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "HP"), - DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL585 G2"), + DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL380"), }, }, {} -- cgit v1.2.3
From b6db80ee1331e7beaeb91b4b3d946dd16c72e388 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 18 May 2008 19:27:48 +0200 Subject: x86: fix setup of cyc2ns in tsc_64.c When the TSC is calibrated against the PIT due to the non-availability of PMTIMER/HPET, or due to SMI interference, the setup of the per-CPU cyc2ns variables is skipped. This is unlikely to happen but it would definitely render sched_clock() unusable. This was introduced with commit 53d517cdbaac704352b3d0c10fecb99e0b54572e x86: scale cyc_2_nsec according to CPU frequency Update the per-CPU cyc2ns variables in all exit paths of tsc_calibrate. Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- arch/x86/kernel/tsc_64.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index fcc16e58609..1784b8077a1 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -227,14 +227,14 @@ void __init tsc_calibrate(void) /* hpet or pmtimer available ? */ if (!hpet && !pm1 && !pm2) { printk(KERN_INFO "TSC calibrated against PIT\n"); - return; + goto out; } /* Check, whether the sampling was disturbed by an SMI */ if (tsc1 == ULONG_MAX || tsc2 == ULONG_MAX) { printk(KERN_WARNING "TSC calibration disturbed by SMI, " "using PIT calibration result\n"); - return; + goto out; } tsc2 = (tsc2 - tsc1) * 1000000L; @@ -255,6 +255,7 @@ void __init tsc_calibrate(void) tsc_khz = tsc2 / tsc1; +out: for_each_possible_cpu(cpu) set_cyc2ns_scale(tsc_khz, cpu); } -- cgit v1.2.3
From 9ccc906c97e34fd91dc6aaf5b69b52d824386910 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 May 2008 12:31:00 +0200 Subject: x86: disentangle user-disabled TSC from unstable tsc_enabled is set to 0 from the command line switch "notsc" and from the mark_tsc_unstable code. Separate those functionalities and replace tsc_enabled with tsc_disabled. This also makes the native_sched_clock() decision about when to use the TSC understandable. Preparatory patch to solve the sched_clock() issue on 32 bit.
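As an illustrative aside (not part of the patch), a minimal userspace sketch of the decision this cleanup makes readable; the flag names mirror the kernel's, but the program itself is hypothetical. With a single tsc_disabled flag, the sched_clock() fallback is taken exactly when the user turned the TSC off or calibration failed, independent of the "unstable" state:

#include <stdio.h>

static int tsc_disabled;	/* "notsc" on the command line, or calibration failed */
static int tsc_unstable;	/* TSC unusable as a clocksource */

static const char *sched_clock_source(void)
{
	if (tsc_disabled)
		return "jiffies";	/* coarse fallback */
	return "tsc";			/* fast path, taken even if unstable */
}

int main(void)
{
	tsc_unstable = 1;
	printf("unstable but not disabled -> %s\n", sched_clock_source());
	tsc_disabled = 1;
	printf("disabled                  -> %s\n", sched_clock_source());
	return 0;
}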
Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc_32.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index e4790728b22..b087d691f16 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -14,7 +14,7 @@ #include "mach_timer.h" -static int tsc_enabled; +static int tsc_disabled; /* * On some systems the TSC frequency does not @@ -28,8 +28,8 @@ EXPORT_SYMBOL_GPL(tsc_khz); static int __init tsc_setup(char *str) { printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC completely.\n"); - mark_tsc_unstable("user disabled TSC"); + "cannot disable TSC completely.\n"); + tsc_disabled = 1; return 1; } #else @@ -120,7 +120,7 @@ unsigned long long native_sched_clock(void) * very important for it to be as fast as the platform * can achive it. ) */ - if (unlikely(!tsc_enabled && !tsc_unstable)) + if (unlikely(tsc_disabled)) /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); @@ -322,7 +322,6 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; - tsc_enabled = 0; printk("Marking TSC unstable due to: %s.\n", reason); /* Can be called before registration */ if (clocksource_tsc.mult) @@ -336,7 +335,7 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable); static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d) { printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", - d->ident); + d->ident); tsc_unstable = 1; return 0; } @@ -403,8 +402,11 @@ void __init tsc_init(void) { int cpu; - if (!cpu_has_tsc) + if (!cpu_has_tsc || tsc_disabled) { + /* Disable the TSC in case of !cpu_has_tsc */ + tsc_disabled = 1; return; + } cpu_khz = calculate_cpu_khz(); tsc_khz = cpu_khz; @@ -441,8 +443,6 @@ void __init tsc_init(void) if (check_tsc_unstable()) { clocksource_tsc.rating = 0; clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; - } else - tsc_enabled = 1; - + } clocksource_register(&clocksource_tsc); } -- cgit v1.2.3 From 74dc51a3de06aa516e3b9fdc4017b2aeb38bf44b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 18 May 2008 22:17:59 +0200 Subject: x86: disable TSC for sched_clock() when calibration failed When the TSC calibration fails then TSC is still used in sched_clock(). Disable it completely in that case. Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- arch/x86/kernel/tsc_32.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index b087d691f16..068759db63d 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -413,6 +413,11 @@ void __init tsc_init(void) if (!cpu_khz) { mark_tsc_unstable("could not calculate TSC khz"); + /* + * We need to disable the TSC completely in this case + * to prevent sched_clock() from using it. + */ + tsc_disabled = 1; return; } -- cgit v1.2.3 From 2584a82deed7196f48066f1b1a7fad4ec5bea961 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Tue, 20 May 2008 18:18:12 -0400 Subject: x86: don't read maxlvt before checking if APIC is mapped A check for unmapped apic was added before reading maxlvt but the early read of maxlvt wasn't removed. 
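As an illustrative aside (not part of the patch): the bug class here is an initializer with a side effect that runs before the validity check. A minimal sketch with hypothetical names, showing the fixed shape — declare first, test the guard, read afterwards:

#include <stdio.h>

struct apic_regs { int maxlvt; };

static int clear_apic(const struct apic_regs *apic)
{
	int maxlvt;

	if (!apic)			/* "APIC hasn't been mapped yet" */
		return -1;
	maxlvt = apic->maxlvt;		/* the read now happens after the guard */
	return maxlvt;
}

int main(void)
{
	struct apic_regs regs = { .maxlvt = 6 };

	printf("unmapped: %d\n", clear_apic(NULL));
	printf("mapped:   %d\n", clear_apic(&regs));
	return 0;
}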
Signed-off-by: Chuck Ebbert Cc: Andi Kleen Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- arch/x86/kernel/apic_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 5910020c3f2..0633cfd0dc2 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -534,7 +534,7 @@ int setup_profiling_timer(unsigned int multiplier) */ void clear_local_APIC(void) { - int maxlvt = lapic_get_maxlvt(); + int maxlvt; u32 v; /* APIC hasn't been mapped yet */ -- cgit v1.2.3
From de067814d6b69030d0030e1c5b3dbaf0385aae41 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 15 May 2008 13:24:52 +0100 Subject: x86/xen: fix arbitrary_virt_to_machine() While I realize that the function isn't currently being used, I still think an obvious mistake like this should be corrected. Signed-off-by: Jan Beulich Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 126766d43ae..3525ef523a7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -60,7 +60,7 @@ xmaddr_t arbitrary_virt_to_machine(unsigned long address) { unsigned int level; pte_t *pte = lookup_address(address, &level); - unsigned offset = address & PAGE_MASK; + unsigned offset = address & ~PAGE_MASK; BUG_ON(pte == NULL); -- cgit v1.2.3
From 2ddfd20e7c55421435cbf95a5ed3dd6e423cf934 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 22 May 2008 10:37:48 +0200 Subject: namespacecheck: automated fixes Signed-off-by: Ingo Molnar --- arch/x86/kernel/kvmclock.c | 4 ++-- arch/x86/kvm/mmu.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 4bc1be5d547..08a30986d47 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -53,7 +53,7 @@ static cycle_t kvm_clock_read(void); * have elapsed since the hypervisor wrote the data. So we try to account for * that with system time */ -unsigned long kvm_get_wallclock(void) +static unsigned long kvm_get_wallclock(void) { u32 wc_sec, wc_nsec; u64 delta; @@ -86,7 +86,7 @@ unsigned long kvm_get_wallclock(void) return ts.tv_sec + 1; } -int kvm_set_wallclock(unsigned long now) +static int kvm_set_wallclock(unsigned long now) { return 0; } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 36c5406b181..7246b60afb9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1996,7 +1996,7 @@ static struct shrinker mmu_shrinker = { .seeks = DEFAULT_SEEKS * 10, }; -void mmu_destroy_caches(void) +static void mmu_destroy_caches(void) { if (pte_chain_cache) kmem_cache_destroy(pte_chain_cache); -- cgit v1.2.3
From a1289643adb6272c04db9399653ae195072c482a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 14 May 2008 16:10:42 -0700 Subject: x86: use explicit copy in vdso_gettimeofday() Jeremy's gcc 3.4 seems to be unable to inline an 8-byte memcpy. But the vdso doesn't support external references. Copy the structure members of struct timezone explicitly instead.
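As an illustrative aside (not part of the patch), the same field-by-field copy works in ordinary userspace C, since struct timezone carries exactly these two int members:

#include <stdio.h>
#include <sys/time.h>

/* Field-by-field copy: no call the compiler might lower to libc memcpy. */
static void copy_timezone(struct timezone *dst, const struct timezone *src)
{
	dst->tz_minuteswest = src->tz_minuteswest;
	dst->tz_dsttime = src->tz_dsttime;
}

int main(void)
{
	struct timezone src = { .tz_minuteswest = -60, .tz_dsttime = 0 };
	struct timezone dst;

	copy_timezone(&dst, &src);
	printf("%d %d\n", dst.tz_minuteswest, dst.tz_dsttime);
	return 0;
}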
Signed-off-by: Andi Kleen Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/vdso/vclock_gettime.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 23476c2ebfc..efa2ba7c600 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -106,9 +106,9 @@ int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) do_realtime((struct timespec *)tv); tv->tv_usec /= 1000; if (unlikely(tz != NULL)) { - /* This relies on gcc inlining the memcpy. We'll notice - if it ever fails to do so. */ - memcpy(tz, &gtod->sys_tz, sizeof(struct timezone)); + /* Avoid memcpy. Some old compilers fail to inline it */ + tz->tz_minuteswest = gtod->sys_tz.tz_minuteswest; + tz->tz_dsttime = gtod->sys_tz.tz_dsttime; } return 0; } -- cgit v1.2.3
From 7fafd91d85181e946207bed18c44addc47e36c63 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Thu, 22 May 2008 15:45:06 -0700 Subject: x86: fix integer as NULL pointer warning arch/x86/boot/printf.c:59:10: warning: Using plain integer as NULL pointer Signed-off-by: Harvey Harrison Signed-off-by: Linus Torvalds --- arch/x86/boot/printf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c index c1d00c0274c..50e47cdbddd 100644 --- a/arch/x86/boot/printf.c +++ b/arch/x86/boot/printf.c @@ -56,7 +56,7 @@ static char *number(char *str, long num, int base, int size, int precision, if (type & LEFT) type &= ~ZEROPAD; if (base < 2 || base > 36) - return 0; + return NULL; c = (type & ZEROPAD) ? '0' : ' '; sign = 0; if (type & SIGN) { -- cgit v1.2.3
From 73531905ed53576d9e8707659a761e7046a60497 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sun, 25 May 2008 23:03:18 +0200 Subject: Kconfig: introduce ARCH_DEFCONFIG to DEFCONFIG_LIST init/Kconfig contains a list of configs that are searched for if 'make *config' is used with no .config present. Extend this list to look at the config identified by ARCH_DEFCONFIG. With this change we now try the defconfig targets last. This fixes a regression reported by: Linus Torvalds Signed-off-by: Sam Ravnborg Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" --- arch/x86/Kconfig | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fe361ae7ef2..dcbec34154c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -26,17 +26,10 @@ config X86 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER -config DEFCONFIG_LIST +config ARCH_DEFCONFIG string - depends on X86_32 - option defconfig_list - default "arch/x86/configs/i386_defconfig" - -config DEFCONFIG_LIST - string - depends on X86_64 - option defconfig_list - default "arch/x86/configs/x86_64_defconfig" + default "arch/x86/configs/i386_defconfig" if X86_32 + default "arch/x86/configs/x86_64_defconfig" if X86_64 config GENERIC_LOCKBREAK -- cgit v1.2.3
From a16ffe93c46dfca211434d00453ebb695025978b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 30 May 2008 15:09:42 -0500 Subject: lguest: fix ugly <NULL> in /proc/interrupts Before: root@ubuntu:~# cat /proc/interrupts CPU0 1: 1672 lguest-<NULL> virtio0 2: 1 lguest-<NULL> virtio1 ...
After: root@ubuntu:~# cat /proc/interrupts CPU0 1: 2889 lguest-level virtio0 2: 9 lguest-level virtio1 Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index af65b2da3ba..5c7e2fd5207 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -582,8 +582,9 @@ static void __init lguest_init_IRQ(void) int vector = FIRST_EXTERNAL_VECTOR + i; if (vector != SYSCALL_VECTOR) { set_intr_gate(vector, interrupt[i]); - set_irq_chip_and_handler(i, &lguest_irq_controller, - handle_level_irq); + set_irq_chip_and_handler_name(i, &lguest_irq_controller, + handle_level_irq, + "level"); } } /* This call is required to set up for 4k stacks, where we have -- cgit v1.2.3 From 75b19b790bec3ebffbf513405b27500e22270cbc Mon Sep 17 00:00:00 2001 From: Bertram Felgenhauer Date: Fri, 30 May 2008 03:20:05 +0200 Subject: pci, x86: add workaround for bug in ASUS A7V600 BIOS (rev 1005) This BIOS claims the VIA 8237 south bridge to be compatible with VIA 586, which it is not. Without this patch, I get the following warning while booting, among others, | PCI: Using IRQ router VIA [1106/3227] at 0000:00:11.0 | ------------[ cut here ]------------ | WARNING: at arch/x86/pci/irq.c:265 pirq_via586_get+0x4a/0x60() | Modules linked in: | Pid: 1, comm: swapper Not tainted 2.6.26-rc4-00015-g1ec7d99 #1 | [] warn_on_slowpath+0x54/0x70 | [] ? vt_console_print+0x210/0x2b0 | [] ? vt_console_print+0x0/0x2b0 | [] ? __call_console_drivers+0x43/0x60 | [] ? _call_console_drivers+0x52/0x80 | [] ? release_console_sem+0x1c9/0x200 | [] ? raw_pci_read+0x41/0x70 | [] ? pci_read+0x2f/0x40 | [] pirq_via586_get+0x4a/0x60 | [] ? pirq_via586_get+0x0/0x60 | [] pcibios_lookup_irq+0x15d/0x430 | [] pcibios_irq_init+0x17a/0x3e0 | [] ? kernel_init+0x0/0x250 | [] kernel_init+0x73/0x250 | [] ? pcibios_irq_init+0x0/0x3e0 | [] ? schedule_tail+0x10/0x40 | [] ? ret_from_fork+0x6/0x1c | [] ? kernel_init+0x0/0x250 | [] ? kernel_init+0x0/0x250 | [] kernel_thread_helper+0x7/0x1c | ======================= | ---[ end trace 4eaa2a86a8e2da22 ]--- and IRQ trouble later, | irq 10: nobody cared (try booting with the "irqpoll" option) Now that's an VIA 8237 chip, so pirq_via586_get shouldn't be called at all; adding this workaround to via_router_probe() fixes the problem for me. Amazingly I have a 2.6.23.8 kernel that somehow works fine ... I'll never understand why. Signed-off-by: Bertram Felgenhauer Acked-by: Alan Cox Signed-off-by: Ingo Molnar --- arch/x86/pci/irq.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 0908fca901b..ca8df9c260b 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -621,6 +621,13 @@ static __init int via_router_probe(struct irq_router *r, */ device = PCI_DEVICE_ID_VIA_8235; break; + case PCI_DEVICE_ID_VIA_8237: + /** + * Asus a7v600 bios wrongly reports 8237 + * as 586-compatible + */ + device = PCI_DEVICE_ID_VIA_8237; + break; } } -- cgit v1.2.3 From db9f600b96c16bb3c7f094e294fbdd370226ad86 Mon Sep 17 00:00:00 2001 From: Miquel van Smoorenburg Date: Wed, 28 May 2008 10:31:25 +0200 Subject: x86: pci-dma.c: use __GFP_NO_OOM instead of __GFP_NORETRY On Wed, 2008-05-28 at 04:47 +0200, Andi Kleen wrote: > > So... why not just remove the setting of __GFP_NORETRY? Why is it > > wrong to oom-kill things in this case? 
> > When the 16MB zone overflows (which can be common in some workloads) > calling the OOM killer is pretty useless because it has barely any > real user data [only exception would be the "only 16MB" case Alan > mentioned]. Killing random processes in this case is bad. > > I think for 16MB __GFP_NORETRY is ok because there should be > nothing freeable in there so looping is useless. Only exception would be the > "only 16MB total" case again but I'm not sure 2.6 supports that at all > on x86. > > On the other hand d_a_c() does more allocations than just 16MB, especially > on 64bit and the other zones need different strategies. Okay, so how about this then ? Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index c5ef1af8e79..069e843f0b9 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -397,9 +397,6 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, if (dev->dma_mask == NULL) return NULL; - /* Don't invoke OOM killer */ - gfp |= __GFP_NORETRY; - #ifdef CONFIG_X86_64 /* Why <=? Even when the mask is smaller than 4GB it is often larger than 16MB and in this case we have a chance of @@ -410,7 +407,9 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, #endif again: - page = dma_alloc_pages(dev, gfp, get_order(size)); + /* Don't invoke OOM killer or retry in lower 16MB DMA zone */ + page = dma_alloc_pages(dev, + (gfp & GFP_DMA) ? gfp | __GFP_NORETRY : gfp, get_order(size)); if (page == NULL) return NULL; -- cgit v1.2.3 From f529626a86d61897862aa1bbbb4537773209238e Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Thu, 29 May 2008 00:30:21 -0700 Subject: suspend-vs-iommu: prevent suspend if we could not resume iommu/gart support misses suspend/resume code, which can do bad stuff, including memory corruption on resume. Prevent system suspend in case we would be unable to resume. Signed-off-by: Pavel Machek Tested-by: Patrick Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index c07455d1695..aa8ec928caa 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -548,6 +549,28 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) return aper_base; } +static int gart_resume(struct sys_device *dev) +{ + return 0; +} + +static int gart_suspend(struct sys_device *dev, pm_message_t state) +{ + return -EINVAL; +} + +static struct sysdev_class gart_sysdev_class = { + .name = "gart", + .suspend = gart_suspend, + .resume = gart_resume, + +}; + +static struct sys_device device_gart = { + .id = 0, + .cls = &gart_sysdev_class, +}; + /* * Private Northbridge GATT initialization in case we cannot use the * AGP driver for some reason. 
@@ -558,7 +581,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) unsigned aper_base, new_aper_base; struct pci_dev *dev; void *gatt; - int i; + int i, error; printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); aper_size = aper_base = info->aper_size = 0; @@ -606,6 +629,12 @@ static __init int init_k8_gatt(struct agp_kern_info *info) pci_write_config_dword(dev, 0x90, ctl); } + + error = sysdev_class_register(&gart_sysdev_class); + if (!error) + error = sysdev_register(&device_gart); + if (error) + panic("Could not register gart_sysdev -- would corrupt data on next suspend"); flush_gart(); printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", -- cgit v1.2.3 From fb3bbd6a663fe972611676381adc4c60ddfe61ac Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 22 May 2008 18:22:30 -0700 Subject: x86: fix APIC warning on 32bit v2 for http://bugzilla.kernel.org/show_bug.cgi?id=10613 BIOS bug, APIC version is 0 for CPU#0! fixing up to 0x10. (tell your hw vendor) v2: fix 64 bit compilation Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: "Rafael J. Wysocki" Cc: Gabriel C Signed-off-by: Thomas Gleixner --- arch/x86/kernel/acpi/boot.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index c49ebcc6c41..33c5216fd3e 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -242,12 +242,19 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) static void __cpuinit acpi_register_lapic(int id, u8 enabled) { + unsigned int ver = 0; + if (!enabled) { ++disabled_cpus; return; } - generic_processor_info(id, 0); +#ifdef CONFIG_X86_32 + if (boot_cpu_physical_apicid != -1U) + ver = apic_version[boot_cpu_physical_apicid]; +#endif + + generic_processor_info(id, ver); } static int __init @@ -767,8 +774,13 @@ static void __init acpi_register_lapic_address(unsigned long address) mp_lapic_addr = address; set_fixmap_nocache(FIX_APIC_BASE, address); - if (boot_cpu_physical_apicid == -1U) + if (boot_cpu_physical_apicid == -1U) { boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); +#ifdef CONFIG_X86_32 + apic_version[boot_cpu_physical_apicid] = + GET_APIC_VERSION(apic_read(APIC_LVR)); +#endif + } } static int __init early_acpi_parse_madt_lapic_addr_ovr(void) -- cgit v1.2.3 From deef325086c3897393b8f7d6bccd03405244fe18 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 15:44:38 +0200 Subject: x86: disable preemption in native_smp_prepare_cpus Priit Laes reported the following warning: Call Trace: [] warn_on_slowpath+0x51/0x63 [] sys_ioctl+0x2d/0x5d [] _spin_lock+0xe/0x24 [] task_rq_lock+0x3d/0x73 [] set_cpu_sibling_map+0x336/0x350 [] read_apic_id+0x30/0x62 [] verify_local_APIC+0x90/0x138 [] native_smp_prepare_cpus+0x1f9/0x305 [] kernel_init+0x59/0x2d9 [] _spin_unlock_irq+0x11/0x2b [] child_rip+0xa/0x12 [] kernel_init+0x0/0x2d9 [] child_rip+0x0/0x12 fix this by generally disabling preemption in native_smp_prepare_cpus(). 
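As an illustrative aside (not part of the patch), the shape of the fix is the classic single-exit pairing: every early exit goes through one label that undoes the preempt_disable() taken at entry. A self-contained sketch with hypothetical stand-ins for the kernel primitives:

#include <stdio.h>

static int preempt_count;	/* stand-in for the kernel's per-CPU count */

static void preempt_disable(void) { preempt_count++; }
static void preempt_enable(void)  { preempt_count--; }

static void prepare_cpus(int sanity_ok)
{
	preempt_disable();
	if (!sanity_ok) {
		printf("SMP disabled\n");
		goto out;	/* not a bare return: keep the pairing */
	}
	printf("bringing up secondary CPUs\n");
out:
	preempt_enable();
}

int main(void)
{
	prepare_cpus(0);
	prepare_cpus(1);
	printf("balanced: %s\n", preempt_count == 0 ? "yes" : "no");
	return 0;
}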
Reported-and-bisected-by: Priit Laes Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/smpboot.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 38988491c62..56078d61c79 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1190,6 +1190,7 @@ static void __init smp_cpu_index_default(void) */ void __init native_smp_prepare_cpus(unsigned int max_cpus) { + preempt_disable(); nmi_watchdog_default(); smp_cpu_index_default(); current_cpu_data = boot_cpu_data; @@ -1206,7 +1207,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); disable_smp(); - return; + goto out; } preempt_disable(); @@ -1246,6 +1247,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) printk(KERN_INFO "CPU%d: ", 0); print_cpu_info(&cpu_data(0)); setup_boot_clock(); +out: + preempt_enable(); } /* * Early setup to make printk work. -- cgit v1.2.3 From 5c1ea08215f1f830dfaf4819a5f22efca41c3832 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 25 May 2008 11:13:32 -0400 Subject: x86: enable preemption in delay The RT team has been searching for a nasty latency. This latency shows up out of the blue and has been seen to be as big as 5ms! Using ftrace I found the cause of the latency. pcscd-2995 3dNh1 52360300us : irq_exit (smp_apic_timer_interrupt) pcscd-2995 3dN.2 52360301us : idle_cpu (irq_exit) pcscd-2995 3dN.2 52360301us : rcu_irq_exit (irq_exit) pcscd-2995 3dN.1 52360771us : smp_apic_timer_interrupt (apic_timer_interrupt ) pcscd-2995 3dN.1 52360771us : exit_idle (smp_apic_timer_interrupt) Here's an example of a 400 us latency. pcscd took a timer interrupt and returned with "need resched" enabled, but did not reschedule until after the next interrupt came in at 52360771us 400us later! At first I thought we somehow missed a preemption check in entry.S. But I also noticed that this always seemed to happen during a __delay call. pcscd-2995 3dN.2 52360836us : rcu_irq_exit (irq_exit) pcscd-2995 3.N.. 52361265us : preempt_schedule (__delay) Looking at the x86 delay, I found my problem. In git commit 35d5d08a085c56f153458c3f5d8ce24123617faf, Andrew Morton placed preempt_disable around the entire delay due to TSC's not working nicely on SMP. Unfortunately for those that care about latencies this is devastating! Especially when we have callers to mdelay(8). Here I enable preemption during the loop and account for anytime the task migrates to a new CPU. The delay asked for may be extended a bit by the migration, but delay only guarantees that it will delay for that minimum time. Delaying longer should not be an issue. [ Thanks to Thomas Gleixner for spotting that cpu wasn't updated, and to place the rep_nop between preempt_enabled/disable. ] Signed-off-by: Steven Rostedt Cc: akpm@osdl.org Cc: Clark Williams Cc: Peter Zijlstra Cc: "Luis Claudio R. 
Goncalves" Cc: Gregory Haskins Cc: Linus Torvalds Cc: Andi Kleen Signed-off-by: Thomas Gleixner --- arch/x86/lib/delay_32.c | 31 +++++++++++++++++++++++++++---- arch/x86/lib/delay_64.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 53 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay_32.c index 4535e6d147a..d710f2d167b 100644 --- a/arch/x86/lib/delay_32.c +++ b/arch/x86/lib/delay_32.c @@ -44,13 +44,36 @@ static void delay_loop(unsigned long loops) static void delay_tsc(unsigned long loops) { unsigned long bclock, now; + int cpu; - preempt_disable(); /* TSC's are per-cpu */ + preempt_disable(); + cpu = smp_processor_id(); rdtscl(bclock); - do { - rep_nop(); + for (;;) { rdtscl(now); - } while ((now-bclock) < loops); + if ((now - bclock) >= loops) + break; + + /* Allow RT tasks to run */ + preempt_enable(); + rep_nop(); + preempt_disable(); + + /* + * It is possible that we moved to another CPU, and + * since TSC's are per-cpu we need to calculate + * that. The delay must guarantee that we wait "at + * least" the amount of time. Being moved to another + * CPU could make the wait longer but we just need to + * make sure we waited long enough. Rebalance the + * counter for this CPU. + */ + if (unlikely(cpu != smp_processor_id())) { + loops -= (now - bclock); + cpu = smp_processor_id(); + rdtscl(bclock); + } + } preempt_enable(); } diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c index bbc61051851..4c441be9264 100644 --- a/arch/x86/lib/delay_64.c +++ b/arch/x86/lib/delay_64.c @@ -31,14 +31,36 @@ int __devinit read_current_timer(unsigned long *timer_value) void __delay(unsigned long loops) { unsigned bclock, now; + int cpu; - preempt_disable(); /* TSC's are pre-cpu */ + preempt_disable(); + cpu = smp_processor_id(); rdtscl(bclock); - do { - rep_nop(); + for (;;) { rdtscl(now); + if ((now - bclock) >= loops) + break; + + /* Allow RT tasks to run */ + preempt_enable(); + rep_nop(); + preempt_disable(); + + /* + * It is possible that we moved to another CPU, and + * since TSC's are per-cpu we need to calculate + * that. The delay must guarantee that we wait "at + * least" the amount of time. Being moved to another + * CPU could make the wait longer but we just need to + * make sure we waited long enough. Rebalance the + * counter for this CPU. + */ + if (unlikely(cpu != smp_processor_id())) { + loops -= (now - bclock); + cpu = smp_processor_id(); + rdtscl(bclock); + } } - while ((now-bclock) < loops); preempt_enable(); } EXPORT_SYMBOL(__delay); -- cgit v1.2.3 From e8a496ac8cd00cabbdaa373db4818a9ad19a1c5a Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 23 May 2008 16:26:37 -0700 Subject: x86: fix broken math-emu with lazy allocation of fpu area Fix the math emulation that got broken with the recent lazy allocation of FPU area. init_fpu() need to be added for the math-emulation path aswell for the FPU area allocation. math emulation enabled kernel booted fine with this, in the presence of "no387 nofxsr" boot param. 
Signed-off-by: Suresh Siddha Cc: hpa@zytor.com Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner --- arch/x86/kernel/i387.c | 44 ++++++++++++++++++++++++++++--------------- arch/x86/math-emu/fpu_entry.c | 13 ++++++++----- 2 files changed, 37 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index e03cc952f23..eb9ddd8efb8 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -56,6 +56,11 @@ void __cpuinit mxcsr_feature_mask_init(void) void __init init_thread_xstate(void) { + if (!HAVE_HWFP) { + xstate_size = sizeof(struct i387_soft_struct); + return; + } + if (cpu_has_fxsr) xstate_size = sizeof(struct i387_fxsave_struct); #ifdef CONFIG_X86_32 @@ -94,7 +99,7 @@ void __cpuinit fpu_init(void) int init_fpu(struct task_struct *tsk) { if (tsk_used_math(tsk)) { - if (tsk == current) + if (HAVE_HWFP && tsk == current) unlazy_fpu(tsk); return 0; } @@ -109,6 +114,15 @@ int init_fpu(struct task_struct *tsk) return -ENOMEM; } +#ifdef CONFIG_X86_32 + if (!HAVE_HWFP) { + memset(tsk->thread.xstate, 0, xstate_size); + finit(); + set_stopped_child_used_math(tsk); + return 0; + } +#endif + if (cpu_has_fxsr) { struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; @@ -330,13 +344,13 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - if (!HAVE_HWFP) - return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); - ret = init_fpu(target); if (ret) return ret; + if (!HAVE_HWFP) + return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); + if (!cpu_has_fxsr) { return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.xstate->fsave, 0, @@ -360,15 +374,15 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - if (!HAVE_HWFP) - return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); - ret = init_fpu(target); if (ret) return ret; set_stopped_child_used_math(target); + if (!HAVE_HWFP) + return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); + if (!cpu_has_fxsr) { return user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.xstate->fsave, 0, -1); @@ -474,18 +488,18 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) int restore_i387_ia32(struct _fpstate_ia32 __user *buf) { int err; + struct task_struct *tsk = current; - if (HAVE_HWFP) { - struct task_struct *tsk = current; - + if (HAVE_HWFP) clear_fpu(tsk); - if (!used_math()) { - err = init_fpu(tsk); - if (err) - return err; - } + if (!used_math()) { + err = init_fpu(tsk); + if (err) + return err; + } + if (HAVE_HWFP) { if (cpu_has_fxsr) err = restore_i387_fxsave(buf); else diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 6e38d877ea7..c7b06feb139 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "fpu_system.h" #include "fpu_emu.h" @@ -146,6 +147,13 @@ asmlinkage void math_emulate(long arg) unsigned long code_limit = 0; /* Initialized to stop compiler warnings */ struct desc_struct code_descriptor; + if (!used_math()) { + if (init_fpu(current)) { + do_group_exit(SIGKILL); + return; + } + } + #ifdef RE_ENTRANT_CHECKING if (emulating) { printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n"); @@ -153,11 +161,6 @@ asmlinkage void math_emulate(long arg) RE_ENTRANT_CHECK_ON; #endif /* RE_ENTRANT_CHECKING */ - if (!used_math()) { - finit(); - set_used_math(); - } - 
SETUP_DATA_AREA(arg); FPU_ORIG_EIP = FPU_EIP; -- cgit v1.2.3 From 226e9a93a253b7d8811b5ed9ac671c6c5a728022 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 May 2008 09:56:49 +0200 Subject: x86: ioremap fix failing nesting check Mika Kukkonen noticed that the nesting check in early_iounmap() is not actually done. Reported-by: Mika Kukkonen Signed-off-by: Ingo Molnar Cc: torvalds@linux-foundation.org Cc: arjan@linux.intel.com Cc: mikukkon@iki.fi Signed-off-by: Thomas Gleixner --- arch/x86/mm/ioremap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 71bb3159031..2b2bb3f9b68 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -593,10 +593,11 @@ void __init early_iounmap(void *addr, unsigned long size) unsigned long offset; unsigned int nrpages; enum fixed_addresses idx; - unsigned int nesting; + int nesting; nesting = --early_ioremap_nested; - WARN_ON(nesting < 0); + if (WARN_ON(nesting < 0)) + return; if (early_ioremap_debug) { printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, -- cgit v1.2.3 From 2884f110d5409714f3a04eeb6d2ecd77da66b242 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 28 May 2008 19:36:07 +0100 Subject: x86: fix bad pmd ffff810000207xxx(9090909090909090) OGAWA Hirofumi and Fede have reported rare pmd_ERROR messages: mm/memory.c:127: bad pmd ffff810000207xxx(9090909090909090). Initialization's cleanup_highmap was leaving alignment filler behind in the pmd for MODULES_VADDR: when vmalloc's guard page would occupy a new page table, it's not allocated, and then module unload's vfree hits the bad 9090 pmd entry left over. Signed-off-by: Hugh Dickins Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 32ba13b0f81..998a06ea5f7 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -206,7 +206,7 @@ void __init cleanup_highmap(void) pmd_t *last_pmd = pmd + PTRS_PER_PMD; for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { - if (!pmd_present(*pmd)) + if (pmd_none(*pmd)) continue; if (vaddr < (unsigned long) _text || vaddr > end) set_pmd(pmd, __pmd(0)); -- cgit v1.2.3 From 511631011d39706ac81ee5e4c9084d61e5b4fd34 Mon Sep 17 00:00:00 2001 From: Kevin Winchester Date: Thu, 29 May 2008 21:14:35 -0300 Subject: x86: fix pointer type warning in arch/x86/mm/init_64.c:early_memtest Changed the call to find_e820_area_size to pass u64 instead of unsigned long. 
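As an illustrative aside (not part of the patch): the kernel prints a u64 with "%016llx"; the portable userspace analogue is PRIx64. A minimal sketch of the corrected variable/format pairing, with made-up values:

#include <stdio.h>
#include <inttypes.h>

int main(void)
{
	uint64_t t_start = 0x100000000ULL;	/* above 4 GiB: needs 64 bits */
	uint64_t t_size  = 0x200000ULL;

	printf("  %016" PRIx64 " - %016" PRIx64 " pattern %d\n",
	       t_start, t_start + t_size, 0);
	return 0;
}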
Signed-off-by: Kevin Winchester Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 998a06ea5f7..156e6d7b0e3 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -506,7 +506,7 @@ early_param("memtest", parse_memtest); static void __init early_memtest(unsigned long start, unsigned long end) { - unsigned long t_start, t_size; + u64 t_start, t_size; unsigned pattern; if (!memtest_pattern) @@ -525,7 +525,7 @@ static void __init early_memtest(unsigned long start, unsigned long end) if (t_start + t_size > end) t_size = end - t_start; - printk(KERN_CONT "\n %016lx - %016lx pattern %d", + printk(KERN_CONT "\n %016llx - %016llx pattern %d", t_start, t_start + t_size, pattern); memtest(t_start, t_size, pattern); -- cgit v1.2.3
From 282c454cd3a7041f59a37112bb2f82263bc38f6c Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Thu, 29 May 2008 12:01:44 -0700 Subject: x86: fix Xorg crash with xf86MapVidMem error Clarify the usage of mtrr_lookup() in the PAT code, and make the PAT code resilient to mtrr lookup problems. Specifically, pat_x_mtrr_type() is restructured to highlight under what conditions we look for the mtrr hint. pat_x_mtrr_type() uses a default type when there are any errors in mtrr lookup (still maintaining the pat consistency). And reserve_memtype() highlights its usage of mtrr_lookup for a request type of '-1' and also defaults in a sane way on any mtrr lookup failure. pat.c looks at the mtrr type of a range to get a hint on what mapping type to request when user/API: (1) hasn't specified any type (/dev/mem mapping) and we do not want to take a performance hit by always mapping UC_MINUS. This will be the case for /dev/mem mappings used to map BIOS area or ACPI region which are WB'able. In this case, as long as MTRR is not WB, PAT will request UC_MINUS for such mappings. (2) user/API requests WB mapping while in reality MTRR may have UC or WC. In this case, PAT can map as WB (without checking MTRR) and still the effective type will be UC or WC. But a subsequent request to map the same region as UC or WC may fail, as the region will get tracked as WB in the PAT list. Looking at the MTRR hint helps us to track based on the effective type rather than what the user requested. Again, here mtrr_lookup is only used as a hint and we fall back to WB mapping (as requested by the user) as the default. In both cases, after using the mtrr hint, we still go through the memtype list to make sure there are no inconsistencies among multiple users.
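As an illustrative aside (not part of the patch), a userspace sketch of the lookup policy described above: WC, UC_MINUS and UC requests are honored directly, and only a WB request consults the MTRR type. The enum and table are illustrative, not the kernel's constants:

#include <stdio.h>

enum cache_type { WB, WC, UC_MINUS, UC };

static const char *name(enum cache_type t)
{
	static const char *n[] = { "WB", "WC", "UC-", "UC" };
	return n[t];
}

static enum cache_type effective_type(enum cache_type pat_req,
				      enum cache_type mtrr)
{
	if (pat_req != WB)
		return pat_req;		/* PAT takes precedence */
	switch (mtrr) {			/* WB request: use the MTRR hint */
	case UC: return UC;
	case WC: return WC;
	default: return WB;
	}
}

int main(void)
{
	printf("req WC, mtrr UC -> %s\n", name(effective_type(WC, UC)));
	printf("req WB, mtrr WC -> %s\n", name(effective_type(WB, WC)));
	printf("req WB, mtrr WB -> %s\n", name(effective_type(WB, WB)));
	return 0;
}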
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Tested-by: Rufus & Azrael Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index de3a9981245..183fdd36d3b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -151,32 +151,33 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, unsigned long pat_type; u8 mtrr_type; - mtrr_type = mtrr_type_lookup(start, end); - if (mtrr_type == 0xFF) { /* MTRR not enabled */ - *ret_prot = prot; - return 0; - } - if (mtrr_type == 0xFE) { /* MTRR match error */ - *ret_prot = _PAGE_CACHE_UC; - return -1; - } - if (mtrr_type != MTRR_TYPE_UNCACHABLE && - mtrr_type != MTRR_TYPE_WRBACK && - mtrr_type != MTRR_TYPE_WRCOMB) { /* MTRR type unhandled */ - *ret_prot = _PAGE_CACHE_UC; - return -1; - } - pat_type = prot & _PAGE_CACHE_MASK; prot &= (~_PAGE_CACHE_MASK); - /* Currently doing intersection by hand. Optimize it later. */ + /* + * We return the PAT request directly for types where PAT takes + * precedence with respect to MTRR and for UC_MINUS. + * Consistency checks with other PAT requests is done later + * while going through memtype list. + */ if (pat_type == _PAGE_CACHE_WC) { *ret_prot = prot | _PAGE_CACHE_WC; + return 0; } else if (pat_type == _PAGE_CACHE_UC_MINUS) { *ret_prot = prot | _PAGE_CACHE_UC_MINUS; - } else if (pat_type == _PAGE_CACHE_UC || - mtrr_type == MTRR_TYPE_UNCACHABLE) { + return 0; + } else if (pat_type == _PAGE_CACHE_UC) { + *ret_prot = prot | _PAGE_CACHE_UC; + return 0; + } + + /* + * Look for MTRR hint to get the effective type in case where PAT + * request is for WB. + */ + mtrr_type = mtrr_type_lookup(start, end); + + if (mtrr_type == MTRR_TYPE_UNCACHABLE) { *ret_prot = prot | _PAGE_CACHE_UC; } else if (mtrr_type == MTRR_TYPE_WRCOMB) { *ret_prot = prot | _PAGE_CACHE_WC; @@ -233,14 +234,12 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (req_type == -1) { /* - * Special case where caller wants to inherit from mtrr or - * existing pat mapping, defaulting to UC_MINUS in case of - * no match. + * Call mtrr_lookup to get the type hint. This is an + * optimization for /dev/mem mmap'ers into WB memory (BIOS + * tools and ACPI tools). Use WB request for WB memory and use + * UC_MINUS otherwise. */ u8 mtrr_type = mtrr_type_lookup(start, end); - if (mtrr_type == 0xFE) { /* MTRR match error */ - err = -1; - } if (mtrr_type == MTRR_TYPE_WRBACK) { req_type = _PAGE_CACHE_WB; -- cgit v1.2.3 From be524fb96081e9e511d993ebf39b05a32b19476e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 29 May 2008 00:01:28 -0700 Subject: x86: section mismatch fix Fix this: WARNING: vmlinux.o(.text+0x114bb): Section mismatch in reference from the function nopat() to the function .cpuinit.text:pat_disable() The function nopat() references the function __cpuinit pat_disable(). This is often because nopat lacks a __cpuinit annotation or the annotation of pat_disable is wrong. 
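As an illustrative aside (not part of the patch), the rule behind the warning is that a function living in discarded init text may only be referenced from sections with the same or shorter lifetime. A compilable sketch where the annotations are stubbed out (in the kernel they are section attributes, and modpost checks the cross-section references):

#include <stdio.h>

#define __init		/* kernel: __attribute__((section(".init.text"))) */
#define __cpuinit	/* kernel: init text unless CPU hotplug is enabled */

static void __cpuinit pat_disable(const char *reason)
{
	printf("%s\n", reason);
}

static int __init nopat(char *str)	/* __init added: lifetimes now match */
{
	(void)str;
	pat_disable("PAT support disabled.");
	return 0;
}

int main(void)
{
	return nopat(NULL);
}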
Reported-by: "Fabio Comolli" Cc: Sam Ravnborg Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 183fdd36d3b..06b7a1c90fb 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -34,7 +34,7 @@ void __cpuinit pat_disable(char *reason) printk(KERN_INFO "%s\n", reason); } -static int nopat(char *str) +static int __init nopat(char *str) { pat_disable("PAT support disabled."); return 0; -- cgit v1.2.3 From cd76374e9de4501acc74f833dc6cb5e7a5dca115 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Thu, 29 May 2008 00:30:21 -0700 Subject: suspend-vs-iommu: prevent suspend if we could not resume iommu/gart support misses suspend/resume code, which can do bad stuff, including memory corruption on resume. Prevent system suspend in case we would be unable to resume. Signed-off-by: Pavel Machek Tested-by: Patrick Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index c07455d1695..aa8ec928caa 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -548,6 +549,28 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) return aper_base; } +static int gart_resume(struct sys_device *dev) +{ + return 0; +} + +static int gart_suspend(struct sys_device *dev, pm_message_t state) +{ + return -EINVAL; +} + +static struct sysdev_class gart_sysdev_class = { + .name = "gart", + .suspend = gart_suspend, + .resume = gart_resume, + +}; + +static struct sys_device device_gart = { + .id = 0, + .cls = &gart_sysdev_class, +}; + /* * Private Northbridge GATT initialization in case we cannot use the * AGP driver for some reason. @@ -558,7 +581,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) unsigned aper_base, new_aper_base; struct pci_dev *dev; void *gatt; - int i; + int i, error; printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); aper_size = aper_base = info->aper_size = 0; @@ -606,6 +629,12 @@ static __init int init_k8_gatt(struct agp_kern_info *info) pci_write_config_dword(dev, 0x90, ctl); } + + error = sysdev_class_register(&gart_sysdev_class); + if (!error) + error = sysdev_register(&device_gart); + if (error) + panic("Could not register gart_sysdev -- would corrupt data on next suspend"); flush_gart(); printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", -- cgit v1.2.3 From 870568b39064cab2dd971fe57969916036982862 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 2 Jun 2008 15:57:27 -0700 Subject: x86, fpu: fix CONFIG_PREEMPT=y corruption of application's FPU stack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Jürgen Mell reported an FPU state corruption bug under CONFIG_PREEMPT, and bisected it to commit v2.6.19-1363-gacc2076, "i386: add sleazy FPU optimization". Add tsk_used_math() checks to prevent calling math_state_restore() which can sleep in the case of !tsk_used_math(). This prevents making a blocking call in __switch_to(). Apparently "fpu_counter > 5" check is not enough, as in some signal handling and fork/exec scenarios, fpu_counter > 5 and !tsk_used_math() is possible. It's a side effect though. 
This is the failing scenario: process 'A' is in save_i387_ia32(), just after clear_used_math(), when it takes an interrupt and is pre-empted out. At the next context switch back to process 'A', the kernel tries to restore the math state proactively and sees fpu_counter > 0 and !tsk_used_math(). This results in init_fpu() during __switch_to()'s math_state_restore(), and the resulting FPU corruption is then saved/restored (save_i387_fxsave and restore_i387_fxsave) during the remaining part of the signal handling after the context switch. Bisected-by: Jürgen Mell Signed-off-by: Suresh Siddha Tested-by: Jürgen Mell Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- arch/x86/kernel/process_32.c | 5 ++++- arch/x86/kernel/process_64.c | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f8476dfbb60..6d5483356e7 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -649,8 +649,11 @@ struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the * chances of needing FPU soon are obviously high now + * + * tsk_used_math() checks prevent calling math_state_restore(), + * which can sleep in the case of !tsk_used_math() */ - if (next_p->fpu_counter > 5) + if (tsk_used_math(next_p) && next_p->fpu_counter > 5) math_state_restore(); /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e2319f39988..ac54ff56df8 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -658,8 +658,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the * chances of needing FPU soon are obviously high now + * + * tsk_used_math() checks prevent calling math_state_restore(), + * which can sleep in the case of !tsk_used_math() */ - if (next_p->fpu_counter>5) + if (tsk_used_math(next_p) && next_p->fpu_counter > 5) math_state_restore(); return prev_p; } -- cgit v1.2.3
From 16104b5504fa8be130f7f127a5a1c7dd774efc44 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Thu, 5 Jun 2008 22:47:13 +0200 Subject: x86: fix CONFIG_NONPROMISC_DEVMEM prompt and help text Here is an attempt to translate the prompt and help text into something which is legible and, as a bonus, correct. Signed-off-by: Stefan Richter Signed-off-by: Linus Torvalds --- arch/x86/Kconfig.debug | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index ac1e31ba479..18363374d51 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -6,15 +6,19 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" config NONPROMISC_DEVMEM - bool "Disable promiscuous /dev/mem" + bool "Filter access to /dev/mem" help - The /dev/mem file by default only allows userspace access to PCI - space and the BIOS code and data regions. This is sufficient for - dosemu and X and all common users of /dev/mem. With this config - option, you allow userspace access to all of memory, including - kernel and userspace memory. Accidental access to this is - obviously disasterous, but specific access can be used by people - debugging the kernel.
+ If this option is left off, you allow userspace access to all + of memory, including kernel and userspace memory. Accidental + access to this is obviously disastrous, but specific access can + be used by people debugging the kernel. + + If this option is switched on, the /dev/mem file only allows + userspace access to PCI space and the BIOS code and data regions. + This is sufficient for dosemu and X and all common users of + /dev/mem. + + If in doubt, say Y. config EARLY_PRINTK bool "Early printk" if EMBEDDED -- cgit v1.2.3 From 2bdd1b031b200d55c2512c8d7e0e9bdcf85d011f Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Thu, 5 Jun 2008 14:14:41 -0700 Subject: PCI/x86: fix up PCI stuff so that PCI_GOANY supports OLPC Previously, one would have to specifically choose CONFIG_OLPC and CONFIG_PCI_GOOLPC in order to enable PCI_OLPC. That doesn't really work for distro kernels, so this patch allows one to choose CONFIG_OLPC and CONFIG_PCI_GOANY in order to build in OLPC support in a generic kernel (as requested by Robert Millan). This also moves GOOLPC before GOANY in the menuconfig list. Finally, make pci_access_init return early if we detect OLPC hardware. There's no need to continue probing stuff, and pci_pcbios_init specifically trashes our settings (we didn't run into that before because PCI_GOANY wasn't supported). Signed-off-by: Andres Salomon Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Jesse Barnes --- arch/x86/Kconfig | 11 +++++------ arch/x86/pci/init.c | 3 ++- arch/x86/pci/olpc.c | 5 +++-- arch/x86/pci/pci.h | 2 +- 4 files changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dcbec34154c..52e18e6d2ba 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1508,13 +1508,13 @@ config PCI_GOMMCONFIG config PCI_GODIRECT bool "Direct" -config PCI_GOANY - bool "Any" - config PCI_GOOLPC bool "OLPC" depends on OLPC +config PCI_GOANY + bool "Any" + endchoice config PCI_BIOS @@ -1531,9 +1531,8 @@ config PCI_MMCONFIG depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) config PCI_OLPC - bool - depends on PCI && PCI_GOOLPC - default y + def_bool y + depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY) config PCI_DOMAINS def_bool y diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index e70b9c57b88..b821f4462d9 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -15,7 +15,8 @@ static __init int pci_access_init(void) pci_mmcfg_early_init(); #ifdef CONFIG_PCI_OLPC - pci_olpc_init(); + if (!pci_olpc_init()) + return 0; /* skip additional checks if it's an XO */ #endif #ifdef CONFIG_PCI_BIOS pci_pcbios_init(); diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c index 5e7636558c0..e11e9e803d5 100644 --- a/arch/x86/pci/olpc.c +++ b/arch/x86/pci/olpc.c @@ -302,12 +302,13 @@ static struct pci_raw_ops pci_olpc_conf = { .write = pci_olpc_write, }; -void __init pci_olpc_init(void) +int __init pci_olpc_init(void) { if (!machine_is_olpc() || olpc_has_vsa()) - return; + return -ENODEV; printk(KERN_INFO "PCI: Using configuration type OLPC\n"); raw_pci_ops = &pci_olpc_conf; is_lx = is_geode_lx(); + return 0; } diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h index f3972b12c60..720c4c55453 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/pci/pci.h @@ -101,7 +101,7 @@ extern struct pci_raw_ops pci_direct_conf1; extern int pci_direct_probe(void); extern void pci_direct_init(int type); extern void pci_pcbios_init(void); -extern void pci_olpc_init(void); +extern int 
pci_olpc_init(void); /* pci-mmconfig.c */ -- cgit v1.2.3 From 9f67fd5db50566728996b0115a08c83d4f902cb3 Mon Sep 17 00:00:00 2001 From: Bertram Felgenhauer Date: Thu, 5 Jun 2008 15:31:22 -0700 Subject: x86/PCI: add workaround for bug in ASUS A7V600 BIOS (rev 1005) This BIOS claims the VIA 8237 south bridge to be compatible with VIA 586, which it is not. Without this patch, I get the following warning while booting, among others, | PCI: Using IRQ router VIA [1106/3227] at 0000:00:11.0 | ------------[ cut here ]------------ | WARNING: at arch/x86/pci/irq.c:265 pirq_via586_get+0x4a/0x60() | Modules linked in: | Pid: 1, comm: swapper Not tainted 2.6.26-rc4-00015-g1ec7d99 #1 | [] warn_on_slowpath+0x54/0x70 | [] ? vt_console_print+0x210/0x2b0 | [] ? vt_console_print+0x0/0x2b0 | [] ? __call_console_drivers+0x43/0x60 | [] ? _call_console_drivers+0x52/0x80 | [] ? release_console_sem+0x1c9/0x200 | [] ? raw_pci_read+0x41/0x70 | [] ? pci_read+0x2f/0x40 | [] pirq_via586_get+0x4a/0x60 | [] ? pirq_via586_get+0x0/0x60 | [] pcibios_lookup_irq+0x15d/0x430 | [] pcibios_irq_init+0x17a/0x3e0 | [] ? kernel_init+0x0/0x250 | [] kernel_init+0x73/0x250 | [] ? pcibios_irq_init+0x0/0x3e0 | [] ? schedule_tail+0x10/0x40 | [] ? ret_from_fork+0x6/0x1c | [] ? kernel_init+0x0/0x250 | [] ? kernel_init+0x0/0x250 | [] kernel_thread_helper+0x7/0x1c | ======================= | ---[ end trace 4eaa2a86a8e2da22 ]--- and IRQ trouble later, | irq 10: nobody cared (try booting with the "irqpoll" option) Now that's an VIA 8237 chip, so pirq_via586_get shouldn't be called at all; adding this workaround to via_router_probe() fixes the problem for me. Amazingly I have a 2.6.23.8 kernel that somehow works fine ... I'll never understand why. Signed-off-by: Bertram Felgenhauer Cc: Jesse Barnes Acked-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Jesse Barnes --- arch/x86/pci/irq.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 0908fca901b..ca8df9c260b 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -621,6 +621,13 @@ static __init int via_router_probe(struct irq_router *r, */ device = PCI_DEVICE_ID_VIA_8235; break; + case PCI_DEVICE_ID_VIA_8237: + /** + * Asus a7v600 bios wrongly reports 8237 + * as 586-compatible + */ + device = PCI_DEVICE_ID_VIA_8237; + break; } } -- cgit v1.2.3 From 33e3885de25148e00595c4dd808d6eb15db2edcf Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 21 May 2008 15:34:25 +0300 Subject: KVM: x86 emulator: fix hypercall return value on AMD The hypercall instructions on Intel and AMD are different. KVM allows the guest to choose one or the other (the default is Intel), and if the guest chooses incorrectly, KVM will patch it at runtime to select the correct instruction. This allows live migration between Intel and AMD machines. This patching occurs in the x86 emulator. The current code also executes the hypercall. Unfortunately, the tail end of the x86 emulator code also executes, overwriting the return value of the hypercall with the original contents of rax (which happens to be the hypercall number). Fix not by executing the hypercall in the emulator context; instead let the guest reissue the patched instruction and execute the hypercall via the normal path. 
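A minimal sketch of that approach; the field names (c->eip, ctxt->vcpu->arch.rip) follow the 2.6.26-era emulator and should be read as assumptions:

	/* Instead of completing the hypercall in the emulator, rewind the
	 * guest instruction pointer to the freshly patched instruction
	 * and suppress writeback. The guest re-executes the hypercall
	 * natively, so the emulator tail can no longer clobber rax. */
	c->eip = ctxt->vcpu->arch.rip;
	c->dst.type = OP_NONE;		/* disable writeback */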
Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 8a96320ab07..932f216d890 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -1727,7 +1727,8 @@ twobyte_insn: if (rc) goto done; - kvm_emulate_hypercall(ctxt->vcpu); + /* Let the processor re-execute the fixed hypercall */ + c->eip = ctxt->vcpu->arch.rip; /* Disable writeback. */ c->dst.type = OP_NONE; break; -- cgit v1.2.3 From 2f5997140f22f68f6390c49941150d3fa8a95cb7 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 May 2008 12:10:20 -0300 Subject: KVM: migrate PIT timer Migrate the PIT timer to the physical CPU which vcpu0 is scheduled on, similarly to what is done for the LAPIC timers, otherwise PIT interrupts will be delayed until an unrelated event causes an exit. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 14 +++++++++++++- arch/x86/kvm/irq.c | 6 ++++++ arch/x86/kvm/irq.h | 2 ++ arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- 6 files changed, 24 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 7c077a9d977..f2f5d260874 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -200,7 +200,6 @@ int __pit_timer_fn(struct kvm_kpit_state *ps) atomic_inc(&pt->pending); smp_mb__after_atomic_inc(); - /* FIXME: handle case where the guest is in guest mode */ if (vcpu0 && waitqueue_active(&vcpu0->wq)) { vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; wake_up_interruptible(&vcpu0->wq); @@ -237,6 +236,19 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } +void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_pit *pit = vcpu->kvm->arch.vpit; + struct hrtimer *timer; + + if (vcpu->vcpu_id != 0 || !pit) + return; + + timer = &pit->pit_state.pit_timer.timer; + if (hrtimer_cancel(timer)) + hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); +} + static void destroy_pit_timer(struct kvm_kpit_timer *pt) { pr_debug("pit: execute del timer!\n"); diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index ce1f583459b..76d736b5f66 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -94,3 +94,9 @@ void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) /* TODO: PIT, RTC etc. 
*/ } EXPORT_SYMBOL_GPL(kvm_timer_intr_post); + +void __kvm_migrate_timers(struct kvm_vcpu *vcpu) +{ + __kvm_migrate_apic_timer(vcpu); + __kvm_migrate_pit_timer(vcpu); +} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 1802134b836..2a15be2275c 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -84,6 +84,8 @@ void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); +void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); +void __kvm_migrate_timers(struct kvm_vcpu *vcpu); int pit_has_pending_timer(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ab22615eee8..6b0d5fa5bab 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -688,7 +688,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) delta = vcpu->arch.host_tsc - tsc_this; svm->vmcb->control.tsc_offset += delta; vcpu->cpu = cpu; - kvm_migrate_apic_timer(vcpu); + kvm_migrate_timers(vcpu); } for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bfe4db11989..96445f34151 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -608,7 +608,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (vcpu->cpu != cpu) { vcpu_clear(vmx); - kvm_migrate_apic_timer(vcpu); + kvm_migrate_timers(vcpu); vpid_sync_vcpu_all(vmx); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 21338bdb28f..00acf1301a1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2758,7 +2758,7 @@ again: if (vcpu->requests) { if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) - __kvm_migrate_apic_timer(vcpu); + __kvm_migrate_timers(vcpu); if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests)) { kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; -- cgit v1.2.3 From e693d71b46e64536581bf4884434fc1b8797e96f Mon Sep 17 00:00:00 2001 From: Eli Collins Date: Sun, 1 Jun 2008 20:24:40 -0700 Subject: KVM: VMX: Clear CR4.VMXE in hardware_disable Clear CR4.VMXE in hardware_disable. There's no reason to leave it set after doing a VMXOFF. VMware Workstation 6.5 checks CR4.VMXE as a proxy for whether the CPU is in VMX mode, so leaving VMXE set means we'll refuse to power on. With this change the user can power on after unloading the kvm-intel module. I tested on kvm-67 and kvm-69. Signed-off-by: Eli Collins Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 96445f34151..02efbe75f31 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1036,6 +1036,7 @@ static void hardware_enable(void *garbage) static void hardware_disable(void *garbage) { asm volatile (ASM_VMX_VMXOFF : : : "cc"); + write_cr4(read_cr4() & ~X86_CR4_VMXE); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, -- cgit v1.2.3 From 8d2d73b9a5c35f2c6abf427afba7888cfc4cc65d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 4 Jun 2008 18:42:24 +0300 Subject: KVM: MMU: reschedule during shadow teardown Shadows for large guests can take a long time to tear down, so reschedule occasionally to avoid softlockup warnings. 
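The pattern is a voluntary preemption point inside the zap loop; a hedged sketch, assuming the active_mmu_pages list this code maintains:

	while (!list_empty(&kvm->arch.active_mmu_pages)) {
		sp = container_of(kvm->arch.active_mmu_pages.next,
				  struct kvm_mmu_page, link);
		kvm_mmu_zap_page(kvm, sp);
		/* yield here so a multi-second teardown cannot hog the
		 * CPU and trip the softlockup watchdog */
		cond_resched();
	}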
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7246b60afb9..c2fd6a4a58c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1858,6 +1858,7 @@ static void free_mmu_pages(struct kvm_vcpu *vcpu) sp = container_of(vcpu->kvm->arch.active_mmu_pages.next, struct kvm_mmu_page, link); kvm_mmu_zap_page(vcpu->kvm, sp); + cond_resched(); } free_page((unsigned long)vcpu->arch.mmu.pae_root); } -- cgit v1.2.3 From ebb0e6264c7a65c51feb3575e9edb58eab0cf469 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 May 2008 16:21:58 +0300 Subject: KVM: MMU: Fix printk() format string Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 156fe10288a..934c7b61939 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -418,7 +418,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, /* mmio */ if (is_error_pfn(pfn)) { - pgprintk("gfn %x is mmio\n", walker.gfn); + pgprintk("gfn %lx is mmio\n", walker.gfn); kvm_release_pfn_clean(pfn); return 1; } -- cgit v1.2.3 From 3c9155106d589584f67b026ec444e69c4a68d7dc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 May 2008 16:21:13 +0300 Subject: KVM: MMU: Fix is_empty_shadow_page() check The check is only looking at one of two possible empty ptes. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c2fd6a4a58c..ee3f53098f0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -658,7 +658,7 @@ static int is_empty_shadow_page(u64 *spt) u64 *end; for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) - if (*pos != shadow_trap_nonpresent_pte) { + if (is_shadow_present_pte(*pos)) { printk(KERN_ERR "%s: %p %llx\n", __func__, pos, *pos); return 0; -- cgit v1.2.3 From b7f09ae583c49d28b2796d2fa5893dcf822e3a10 Mon Sep 17 00:00:00 2001 From: Miquel van Smoorenburg Date: Thu, 5 Jun 2008 18:14:44 +0200 Subject: x86, pci-dma.c: don't always add __GFP_NORETRY to gfp Currently arch/x86/kernel/pci-dma.c always adds __GFP_NORETRY to the allocation flags, because it wants to be reasonably sure not to deadlock when calling alloc_pages(). But really that should only be done in two cases: - when allocating memory in the lower 16 MB DMA zone. If there's no free memory there, waiting or OOM killing is of no use - when optimistically trying an allocation in the DMA32 zone when dma_mask < DMA_32BIT_MASK hoping that the allocation happens to fall within the limits of the dma_mask Also, blindly adding __GFP_NORETRY to the gfp variable might not be a good idea, since we then also use it when calling dma_ops->alloc_coherent(). Clearing it might not be a good idea either; dma_alloc_coherent()'s caller might have set it on purpose. The gfp variable should not be clobbered. [ mingo@elte.hu: converted to delta patch on top of previous version.
] Signed-off-by: Miquel van Smoorenburg Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 069e843f0b9..dc00a1331ac 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -378,6 +378,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, struct page *page; unsigned long dma_mask = 0; dma_addr_t bus; + int noretry = 0; /* ignore region specifiers */ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); @@ -397,19 +398,25 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, if (dev->dma_mask == NULL) return NULL; + /* Don't invoke OOM killer or retry in lower 16MB DMA zone */ + if (gfp & __GFP_DMA) + noretry = 1; + #ifdef CONFIG_X86_64 /* Why <=? Even when the mask is smaller than 4GB it is often larger than 16MB and in this case we have a chance of finding fitting memory in the next higher zone first. If not retry with true GFP_DMA. -AK */ - if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) + if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) { gfp |= GFP_DMA32; + if (dma_mask < DMA_32BIT_MASK) + noretry = 1; + } #endif again: - /* Don't invoke OOM killer or retry in lower 16MB DMA zone */ page = dma_alloc_pages(dev, - (gfp & GFP_DMA) ? gfp | __GFP_NORETRY : gfp, get_order(size)); + noretry ? gfp | __GFP_NORETRY : gfp, get_order(size)); if (page == NULL) return NULL; -- cgit v1.2.3 From 39b8931b5cad9a7cbcd2394a40a088311e783a82 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 9 Jun 2008 16:48:18 -0700 Subject: ACPI: handle invalid ACPI SLIT table This is a SLIT sanity checking patch. It moves slit_valid() function to generic ACPI code and does sanity checking for both x86 and ia64. It sets up node_distance with LOCAL_DISTANCE and REMOTE_DISTANCE when hitting invalid SLIT table on ia64. It also cleans up unused variable localities in acpi_parse_slit() on x86. Signed-off-by: Fenghua Yu Signed-off-by: Andrew Morton Signed-off-by: Len Brown --- arch/x86/mm/srat_64.c | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 3890234e5b2..99649dccad2 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -97,36 +97,9 @@ static __init inline int srat_disabled(void) return numa_off || acpi_numa < 0; } -/* - * A lot of BIOS fill in 10 (= no distance) everywhere. This messes - * up the NUMA heuristics which wants the local node to have a smaller - * distance than the others. - * Do some quick checks here and only use the SLIT if it passes. - */ -static __init int slit_valid(struct acpi_table_slit *slit) -{ - int i, j; - int d = slit->locality_count; - for (i = 0; i < d; i++) { - for (j = 0; j < d; j++) { - u8 val = slit->entry[d*i + j]; - if (i == j) { - if (val != LOCAL_DISTANCE) - return 0; - } else if (val <= LOCAL_DISTANCE) - return 0; - } - } - return 1; -} - /* Callback for SLIT parsing */ void __init acpi_numa_slit_init(struct acpi_table_slit *slit) { - if (!slit_valid(slit)) { - printk(KERN_INFO "ACPI: SLIT table looks invalid. 
Not used.\n"); - return; - } acpi_slit = slit; } -- cgit v1.2.3 From f595ec964daf7f99668039d7303ddedd09a75142 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 12 Jun 2008 10:47:56 +0200 Subject: common implementation of iterative div/mod We have a few instances of the open-coded iterative div/mod loop, used when we don't expect the dividend to be much bigger than the divisor. Unfortunately, modern versions of gcc have a tendency to strength-"reduce" this into a full mod operation, which isn't necessarily any faster, and any speed advantage disappears entirely if gcc implements the operation in libgcc. The workaround is to put a dummy asm statement in the loop to prevent gcc from performing the transformation. This patch creates a single implementation of this loop, and uses it to replace the open-coded versions I know about. Signed-off-by: Jeremy Fitzhardinge Cc: Andrew Morton Cc: john stultz Cc: Segher Boessenkool Cc: Christian Kujau Cc: Robert Hancock Signed-off-by: Ingo Molnar --- arch/x86/xen/time.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index c39e1a5aa24..52b2e385698 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -150,11 +151,7 @@ static void do_stolen_accounting(void) if (stolen < 0) stolen = 0; - ticks = 0; - while (stolen >= NS_PER_TICK) { - ticks++; - stolen -= NS_PER_TICK; - } + ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); __get_cpu_var(residual_stolen) = stolen; account_steal_time(NULL, ticks); @@ -166,11 +163,7 @@ static void do_stolen_accounting(void) if (blocked < 0) blocked = 0; - ticks = 0; - while (blocked >= NS_PER_TICK) { - ticks++; - blocked -= NS_PER_TICK; - } + ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); __get_cpu_var(residual_blocked) = blocked; account_steal_time(idle_task(smp_processor_id()), ticks); } -- cgit v1.2.3 From 3703f39965a197ebd91743fc38d0f640606b8da3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 4 Jun 2008 18:13:37 +0200 Subject: geode: fix modular build -tip testing found this build bug: MODPOST 331 modules ERROR: "geode_mfgpt_toggle_event" [drivers/watchdog/geodewdt.ko] undefined! ERROR: "geode_mfgpt_alloc_timer" [drivers/watchdog/geodewdt.ko] undefined! make[1]: *** [__modpost] Error 1 make: *** [modules] Error 2 with this config: http://redhat.com/~mingo/misc/config-Wed_Jun__4_18_01_59_CEST_2008.bad Export those symbols.
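For context, a sketch of the consumer side that fails to link without the exports; the call pattern is modelled loosely on the geodewdt driver, and the MFGPT_* constants are assumptions taken from the Geode headers:

	/* Inside a module such as geodewdt.ko: both calls below resolve
	 * only if the core kernel exports the symbols. */
	int timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY,
					    MFGPT_DOMAIN_WORKING);
	if (timer < 0)
		return -ENODEV;
	geode_mfgpt_toggle_event(timer, MFGPT_CMP2, MFGPT_EVENT_RESET, 1);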
Signed-off-by: Ingo Molnar --- arch/x86/kernel/mfgpt_32.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 3cad17fe026..07c0f828f48 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -155,6 +155,7 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable) wrmsr(msr, value, dummy); return 0; } +EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event); int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable) { @@ -222,6 +223,7 @@ int geode_mfgpt_alloc_timer(int timer, int domain) /* No timers available - too bad */ return -1; } +EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer); #ifdef CONFIG_GEODE_MFGPT_TIMER -- cgit v1.2.3 From b29c701deacd5d24453127c37ed77ef851c53b8b Mon Sep 17 00:00:00 2001 From: Henry Nestler Date: Mon, 12 May 2008 15:44:39 +0200 Subject: x86: fix endless page faults in mount_block_root for Linux 2.6 Page faults in kernel address space between PAGE_OFFSET and VMALLOC_START should not be treated as vmalloc faults. This fixes rare endless page faults inside mount_block_root for the root filesystem at boot time. All 32-bit kernels up to 2.6.25 can fall into this hole. I could not reproduce this on a native Linux kernel, but I see that 64-bit has already fixed the problem, so I copied the same lines into the 32-bit part. The recorded debug output is from a coLinux kernel 2.6.22.18 (virtualisation): http://www.henrynestler.com/colinux/testing/pfn-check-0.7.3/20080410-antinx/bug16-recursive-page-fault-endless.txt Physical memory was trimmed down to 192MB to catch the bug more easily; with more memory the bug shows up more rarely. Details of how every 32-bit x86 system can fail: Start from "mount_block_root", http://lxr.linux.no/linux/init/do_mounts.c#L297 There the variable "fs_names" gets one memory page of 4096 bytes. The variable "p" walks through the existing filesystem types. The first string is no problem. But with the second loop in mount_block_root, the offset of "p" is no longer at the beginning of the page; the offset is, for example, +9 if "reiserfs" is first in the list. mount_block_root then calls do_mount_root and lands in sys_mount. Remember: the variable "type_page" now contains "fs_type+9" and no longer covers a full page. sys_mount copies 4096 bytes with the function "exact_copy_from_user()": http://lxr.linux.no/linux/fs/namespace.c#L1540 Usually pages exist after the buffer "fs_names+4096+9", the page fault handler is not called, and there is no problem. But if the page after "fs_names+4096" is not mapped, the page fault handler is called from http://lxr.linux.no/linux/fs/namespace.c#L1320 do_page_fault gets the address 0xc03b4000. It is a kernel address (address >= TASK_SIZE), but not from vmalloc! It comes from "__getname()", i.e. "kmem_cache_alloc". The "error_code" is 0. "vmalloc_fault" is then called: http://lxr.linux.no/linux/arch/i386/mm/fault.c#L332 "vmalloc_fault" tries to find the physical page for a non-existing virtual memory area. The macro "pte_present" in vmalloc_fault() takes the next page fault, for 0xc0000ed0, at: http://lxr.linux.no/linux/arch/i386/mm/fault.c#L282 No PTE exists for such a virtual address. The page fault handler then tries to sync the physical page for the PTE lookup, calling vmalloc_fault() again for address 0xc000000, which does not exist either. The endless loop has begun... In the normal case the CPU would just keep looping with interrupts disabled. Under coLinux this was caught by a stack overflow inside the printk debugs.
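Reduced to a sketch, the fix is an early range check at the top of the 32-bit vmalloc_fault() (the actual hunk follows below):

	static int vmalloc_fault(unsigned long address)
	{
		/* Only vmalloc-range addresses can be fixed up by syncing
		 * from the reference page table. Direct-mapping addresses
		 * (PAGE_OFFSET..VMALLOC_START) must fall through to the
		 * normal fault path instead of recursing forever. */
		if (!(address >= VMALLOC_START && address < VMALLOC_END))
			return -1;
		/* ... sync the top-level page table as before ... */
	}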
Signed-off-by: Henry Nestler Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/fault.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fd7e1798c75..8bcb6f40ccb 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -497,6 +497,11 @@ static int vmalloc_fault(unsigned long address) unsigned long pgd_paddr; pmd_t *pmd_k; pte_t *pte_k; + + /* Make sure we are in vmalloc area */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + /* * Synchronize this task's top level page-table * with the 'reference' page table. -- cgit v1.2.3 From 86b2b70e156203149c3861455feec54bc4906e6d Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Mon, 2 Jun 2008 17:21:06 -0400 Subject: x86: fix asm warning in head_32.S On Mon, May 19, 2008 at 04:10:02PM -0700, Linus Torvalds wrote: > It also causes these warnings on 32-bit PAE: > > AS arch/x86/kernel/head_32.o > arch/x86/kernel/head_32.S: Assembler messages: > arch/x86/kernel/head_32.S:225: Warning: left operand is a bignum; integer 0 assumed > arch/x86/kernel/head_32.S:609: Warning: left operand is a bignum; integer 0 assumed > > and I do not see why (the end result seems to be identical). Fix head_32.S gcc bignum warnings when CONFIG_PAE=y. arch/x86/kernel/head_32.S: Assembler messages: arch/x86/kernel/head_32.S:225: Warning: left operand is a bignum; integer 0 assumed arch/x86/kernel/head_32.S:609: Warning: left operand is a bignum; integer 0 assumed The assembler was stumbling over the 64-bit constant 0x100000000 in the KPMDS #define. Testing: a cmp(1) on head_32.o before and after shows the binary is unchanged. Signed-off-by: Joe Korty Cc: Theodore Tso Cc: Gabriel C Cc: Keith Packard Cc: "Pallipadi Venkatesh" Cc: Eric Anholt Cc: "Siddha Suresh B" Cc: bugme-daemon@bugzilla.kernel.org Cc: airlied@linux.ie Cc: "Barnes Jesse" Cc: Jeremy Fitzhardinge Cc: Andrew Morton Cc: "Rafael J. Wysocki" Cc: Linus Torvalds Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index b2cc73768a9..f7357cc0162 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -189,7 +189,7 @@ default_entry: * this stage. */ -#define KPMDS ((0x100000000-__PAGE_OFFSET) >> 30) /* Number of kernel PMDs */ +#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ xorl %ebx,%ebx /* %ebx is kept at zero */ -- cgit v1.2.3 From 0b6a39f7ebcb1c82587ce35b401c513eed41ac5c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Jun 2008 13:29:43 +0200 Subject: Revert "x86: fix ioapic bug again" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 6e908947b4995bc0e551a8257c586d5c3e428201. Németh Márton reported: | there is a problem in 2.6.26-rc3 which was not there in case of | 2.6.25: the CPU wakes up ~90,000 times per sec instead of ~60 per sec. | | I also "git bisected" the problem, the result is: | | 6e908947b4995bc0e551a8257c586d5c3e428201 is first bad commit | commit 6e908947b4995bc0e551a8257c586d5c3e428201 | Author: Ingo Molnar | Date: Fri Mar 21 14:32:36 2008 +0100 | | x86: fix ioapic bug again the original problem is fixed by Maciej W. Rozycki in the tip/x86/apic branch (confirmed by Márton), but those changes are too intrusive for v2.6.26 so we'll go for the less intrusive (repeated) revert now. 
Reported-and-bisected-by: Németh Márton Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 12 ++---------- arch/x86/kernel/nmi_32.c | 9 ++------- 2 files changed, 4 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index a40d54fc1fd..4dc8600d9d2 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -2130,14 +2130,10 @@ static inline void __init check_timer(void) { int apic1, pin1, apic2, pin2; int vector; - unsigned int ver; unsigned long flags; local_irq_save(flags); - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - /* * get/set the timer IRQ vector: */ @@ -2150,15 +2146,11 @@ static inline void __init check_timer(void) * mode for the 8259A whenever interrupts are routed * through I/O APICs. Also IRQ0 has to be enabled in * the 8259A which implies the virtual wire has to be - * disabled in the local APIC. Finally timer interrupts - * need to be acknowledged manually in the 8259A for - * timer_interrupt() and for the i82489DX when using - * the NMI watchdog. + * disabled in the local APIC. */ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); - timer_ack = !cpu_has_tsc; - timer_ack |= (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); + timer_ack = 1; if (timer_over_8254 > 0) enable_8259A_irq(0); diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 11b14bbaa61..84160f74eeb 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -26,7 +26,6 @@ #include #include -#include #include "mach_traps.h" @@ -82,7 +81,7 @@ int __init check_nmi_watchdog(void) prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); if (!prev_nmi_count) - goto error; + return -1; printk(KERN_INFO "Testing NMI watchdog ... "); @@ -119,7 +118,7 @@ int __init check_nmi_watchdog(void) if (!atomic_read(&nmi_active)) { kfree(prev_nmi_count); atomic_set(&nmi_active, -1); - goto error; + return -1; } printk("OK.\n"); @@ -130,10 +129,6 @@ int __init check_nmi_watchdog(void) kfree(prev_nmi_count); return 0; -error: - timer_ack = !cpu_has_tsc; - - return -1; } static int __init setup_nmi_watchdog(char *str) -- cgit v1.2.3 From 52aaa12fbe786c90396f1b11ec39c924ccdd8fd5 Mon Sep 17 00:00:00 2001 From: Manish Katiyar Date: Thu, 5 Jun 2008 19:14:15 +0530 Subject: x86: fix unused variable 'loops' warning in arch/x86/boot/a20.c Following patch fixes the below warning message : arch/x86/boot/a20.c:118: warning: unused variable 'loops' Signed-off-by : Manish Katiyar Signed-off-by: Ingo Molnar --- arch/x86/boot/a20.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c index 90943f83e84..e01aafd03bd 100644 --- a/arch/x86/boot/a20.c +++ b/arch/x86/boot/a20.c @@ -115,8 +115,6 @@ static void enable_a20_fast(void) int enable_a20(void) { - int loops = A20_ENABLE_LOOPS; - #if defined(CONFIG_X86_ELAN) /* Elan croaks if we try to touch the KBC */ enable_a20_fast(); @@ -128,6 +126,7 @@ int enable_a20(void) enable_a20_kbc(); return 0; #else + int loops = A20_ENABLE_LOOPS; while (loops--) { /* First, check to see if A20 is already enabled (legacy free, etc.) */ -- cgit v1.2.3 From e32e58a96de4ac35a03349db2ab69f263ded958f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 6 Jun 2008 10:14:08 +0200 Subject: x86: fix lockdep warning during suspend-to-ram Andrew Morton wrote: > I've been seeing the below for a long time during suspend-to-ram on the Vaio. 
> > > PM: Syncing filesystems ... done. > PM: Preparing system for mem sleep > Freezing user space processes ... <4>------------[ cut here ]------------ > WARNING: at kernel/lockdep.c:2658 check_flags+0x4c/0x127() > Modules linked in: i915 drm ipw2200 sonypi ipv6 autofs4 hidp l2cap bluetooth sunrpc nf_conntrack_netbios_ns ipt_REJECT nf_conntrack_ipv4 xt_state nf_conntrack xt_tcpudp iptable_filter ip_tables x_tables acpi_cpufreq nvram ohci1394 ieee1394 ehci_hcd uhci_hcd sg joydev snd_hda_intel snd_seq_dummy sr_mod snd_seq_oss cdrom snd_seq_midi_event snd_seq snd_seq_device snd_pcm_oss snd_mixer_oss ieee80211 pcspkr ieee80211_crypt snd_pcm i2c_i801 snd_timer i2c_core ide_pci_generic piix snd soundcore snd_page_alloc button ext3 jbd ide_disk ide_core [last unloaded: ipw2200] > Pid: 3250, comm: zsh Not tainted 2.6.26-rc5 #1 > [] warn_on_slowpath+0x41/0x6d > [] ? native_sched_clock+0x82/0x96 > [] ? mark_held_locks+0x41/0x5c > [] ? _spin_unlock_irqrestore+0x36/0x58 > [] ? trace_hardirqs_on+0xe6/0x10d > [] ? __lock_acquire+0xae3/0xb2b > [] ? schedule+0x39b/0x3b4 > [] check_flags+0x4c/0x127 > [] lock_acquire+0x3a/0x86 > [] _spin_lock+0x26/0x53 > [] ? refrigerator+0x13/0xc3 > [] refrigerator+0x13/0xc3 > [] get_signal_to_deliver+0x3c/0x31e > [] do_notify_resume+0x91/0x6ee > [] ? lock_release_holdtime+0x50/0x56 > [] ? _spin_unlock_irqrestore+0x36/0x58 > [] ? read_chan+0x0/0x58c > [] ? trace_hardirqs_on+0xe6/0x10d > [] ? _spin_unlock_irqrestore+0x42/0x58 > [] ? tty_ldisc_deref+0x5c/0x63 > [] ? tty_read+0x66/0x98 > [] ? audit_syscall_exit+0x2aa/0x2c5 > [] ? do_syscall_trace+0x6b/0x16f > [] work_notifysig+0x13/0x1b > ======================= > ---[ end trace 25b49fe59a25afa5 ]--- > possible reason: unannotated irqs-off. > irq event stamp: 58919 > hardirqs last enabled at (58919): [] syscall_exit_work+0x11/0x26 Joy - I so love entry.S Best I can make of it: syscall_exit_work resume_userspace DISABLE_INTERRUPTS (no TRACE_IRQS_OFF) work_pending work_notifysig do_notify_resume() do_signal() get_signal_to_deliver() try_to_freeze() refrigerator() task_lock() -> check_flags() -> BANG The normal path is: syscall_exit_work resume_userspace DISABLE_INTERRUPTS restore_all TRACE_IRQS_IRET iret No idea why that would not warn.. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 2a609dc3271..c778e4fa55a 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -248,6 +248,7 @@ ENTRY(resume_userspace) DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret + TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done on # int/exception return? -- cgit v1.2.3 From eb53e9f3ea859a6d59c37b500593b970aa8562e6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 7 Jun 2008 17:18:40 +0100 Subject: x86: fix an incompatible pointer type warning on 64-bit compilations Fix an incompatible pointer type warning on x86_64 compilations. early_memtest() is passing a u64* to find_e820_area_size() which is expecting an unsigned long. Change t_start and t_size to unsigned long as those are also 64-bit types on x86_64.
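The rule at stake here (and in the follow-up patch further below that flips the types back) is that the printk format width must match the argument type, or gcc emits a format warning on one of the two word sizes; an illustrative sketch, not kernel code:

	u64 t_start = 0;
	unsigned long t_size = 0;

	/* %llx pairs with u64 (cast to unsigned long long for
	 * portability), %lx pairs with unsigned long; mixing them up
	 * warns on 32-bit or 64-bit builds respectively. */
	printk(KERN_CONT "\n %016llx - %016lx",
	       (unsigned long long)t_start, t_size);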
Signed-off-by: David Howells Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 156e6d7b0e3..998a06ea5f7 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -506,7 +506,7 @@ early_param("memtest", parse_memtest); static void __init early_memtest(unsigned long start, unsigned long end) { - u64 t_start, t_size; + unsigned long t_start, t_size; unsigned pattern; if (!memtest_pattern) @@ -525,7 +525,7 @@ static void __init early_memtest(unsigned long start, unsigned long end) if (t_start + t_size > end) t_size = end - t_start; - printk(KERN_CONT "\n %016llx - %016llx pattern %d", + printk(KERN_CONT "\n %016lx - %016lx pattern %d", t_start, t_start + t_size, pattern); memtest(t_start, t_size, pattern); -- cgit v1.2.3 From 4461145ef1be92851c230f858f6b6f457c99670f Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 12 Jun 2008 08:55:59 +0200 Subject: x86, lockdep: fix "WARNING: at kernel/lockdep.c:2658 check_flags+0x4c/0x128()" Alessandro Suardi reported: > Recently upgraded my FC6 desktop to Fedora 9; with the > latest nautilus RPM updates my VNC session went nuts > with nautilus pegging the CPU for everything that breathed. > > I now reverted to an earlier nautilus package, but during > the peak CPU period my kernel spat this: > > [314185.623294] ------------[ cut here ]------------ > [314185.623414] WARNING: at kernel/lockdep.c:2658 check_flags+0x4c/0x128() > [314185.623514] Modules linked in: iptable_filter ip_tables x_tables > sunrpc ipv6 fuse snd_via82xx snd_ac97_codec ac97_bus snd_mpu401_uart > snd_rawmidi via686a hwmon parport_pc sg parport uhci_hcd ehci_hcd > [314185.623924] Pid: 12314, comm: nautilus Not tainted 2.6.26-rc5-git2 #4 > [314185.624021] [] warn_on_slowpath+0x41/0x7b > [314185.624021] [] ? do_page_fault+0x2c1/0x5fd > [314185.624021] [] ? up_read+0x16/0x28 > [314185.624021] [] ? do_page_fault+0x2c1/0x5fd > [314185.624021] [] ? __lock_acquire+0xbb4/0xbc3 > [314185.624021] [] check_flags+0x4c/0x128 > [314185.624021] [] lock_acquire+0x31/0x7d > [314185.624021] [] __atomic_notifier_call_chain+0x30/0x80 > [314185.624021] [] ? __atomic_notifier_call_chain+0x0/0x80 > [314185.624021] [] atomic_notifier_call_chain+0xc/0xe > [314185.624021] [] notify_die+0x2d/0x2f > [314185.624021] [] do_int3+0x1f/0x4d > [314185.624021] [] int3+0x27/0x2c > [314185.624021] ======================= > [314185.624021] ---[ end trace 1923f65a2d7bb246 ]--- > [314185.624021] possible reason: unannotated irqs-off. > [314185.624021] irq event stamp: 488879 > [314185.624021] hardirqs last enabled at (488879): [] > restore_nocheck+0x12/0x15 > [314185.624021] hardirqs last disabled at (488878): [] > work_resched+0x19/0x30 > [314185.624021] softirqs last enabled at (488876): [] > __do_softirq+0xa6/0xac > [314185.624021] softirqs last disabled at (488865): [] > do_softirq+0x57/0xa6 > > I didn't seem to find it with some googling, so here it is. > > I was incidentally ltracing that process to try and find out > what was gulping down that much CPU (sorry, no idea > whether ltrace and the WARNING happened at the same > time or which came first) and: Yeah, this is extremely likely to be the source of the warning. The warning should be harmless, however. > Box is my trusty noname K7-800, 512MB RAM; if there's > anything else useful I might be able to provide, just ask. It would be interesting to see where the int3 comes from. 
Too bad, lockdep doesn't provide the register dump. The stacktrace also doesn't go further than the int3(); I wonder if this int3 came from userspace? The ltrace readme says "software breakpoints, like gdb", so I guess this is the case. Yep, seems like it. This looks relevant: | commit fb1dac909d94ff807cd833d340c6827c3a957159 | Author: Peter Zijlstra | Date: Wed Jan 16 09:51:59 2008 +0100 | | lockdep: more hardirq annotations for notify_die() I'm attaching a similar-looking patch for this case (DO_VM86_ERROR), though I suspect it might be missing for the other cases (DO_ERROR/DO_ERROR_INFO) as well. Reported-by: Alessandro Suardi Signed-off-by: Vegard Nossum Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index bde6f63e15d..08d752de4ee 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -544,6 +544,7 @@ vm86_trap: #define DO_ERROR(trapnr, signr, str, name) \ void do_##name(struct pt_regs *regs, long error_code) \ { \ + trace_hardirqs_fixup(); \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ -- cgit v1.2.3 From f8a45704f5bd5f037c8e4a75172cab1476fc0447 Mon Sep 17 00:00:00 2001 From: Kevin Winchester Date: Thu, 29 May 2008 21:14:35 -0300 Subject: x86: fix pointer type warning in arch/x86/mm/init_64.c:early_memtest Changed the call to find_e820_area_size to pass u64 instead of unsigned long. Signed-off-by: Kevin Winchester Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 998a06ea5f7..156e6d7b0e3 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -506,7 +506,7 @@ early_param("memtest", parse_memtest); static void __init early_memtest(unsigned long start, unsigned long end) { - unsigned long t_start, t_size; + u64 t_start, t_size; unsigned pattern; if (!memtest_pattern) @@ -525,7 +525,7 @@ static void __init early_memtest(unsigned long start, unsigned long end) if (t_start + t_size > end) t_size = end - t_start; - printk(KERN_CONT "\n %016lx - %016lx pattern %d", + printk(KERN_CONT "\n %016llx - %016llx pattern %d", t_start, t_start + t_size, pattern); memtest(t_start, t_size, pattern); -- cgit v1.2.3 From 1da2e3d679a8ea2d9e82040359a706da0bd3bef6 Mon Sep 17 00:00:00 2001 From: Stas Sergeev Date: Thu, 12 Jun 2008 15:21:54 -0700 Subject: provide rtc_cmos platform device Recently (around 2.6.25) I've noticed that RTC no longer works for me. It turned out this is because I use the pnpacpi=off kernel option to work around the parport_pc bugs. I always did so, but RTC used to work fine in the past, and now it has regressed. The patch fixes the problem by creating the platform device for the RTC when PNP is disabled. This may also help running the PNP-enabled kernel on older PCs.
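The device registered in the hunk below is bound through ordinary platform-bus name matching; a sketch of the driver side (the probe hook name here is illustrative, the real driver lives in drivers/rtc/rtc-cmos.c):

	static struct platform_driver cmos_platform_driver = {
		.probe	= cmos_platform_probe,	/* illustrative hook */
		.driver	= {
			.name = "rtc_cmos",	/* must match the device name */
		},
	};

	static int __init cmos_init(void)
	{
		return platform_driver_register(&cmos_platform_driver);
	}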
Signed-off-by: Stas Sergeev Cc: David Brownell Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Bjorn Helgaas Cc: Adam Belay Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/rtc.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 9615eee9b77..05191bbc68b 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include #include @@ -197,3 +199,35 @@ unsigned long long native_read_tsc(void) } EXPORT_SYMBOL(native_read_tsc); + +static struct resource rtc_resources[] = { + [0] = { + .start = RTC_PORT(0), + .end = RTC_PORT(1), + .flags = IORESOURCE_IO, + }, + [1] = { + .start = RTC_IRQ, + .end = RTC_IRQ, + .flags = IORESOURCE_IRQ, + } +}; + +static struct platform_device rtc_device = { + .name = "rtc_cmos", + .id = -1, + .resource = rtc_resources, + .num_resources = ARRAY_SIZE(rtc_resources), +}; + +static __init int add_rtc_cmos(void) +{ +#ifdef CONFIG_PNP + if (!pnp_platform_devices) + platform_device_register(&rtc_device); +#else + platform_device_register(&rtc_device); +#endif /* CONFIG_PNP */ + return 0; +} +device_initcall(add_rtc_cmos); -- cgit v1.2.3 From 42a886af728c089df8da1b0017b0e7e6c81b5335 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 17 Jun 2008 17:47:50 -0700 Subject: x86-64: Fix "bytes left to copy" return value for copy_from_user() Most users by far do not care about the exact return value (they only really care about whether the copy succeeded in its entirety or not), but a few special core routines actually care deeply about exactly how many bytes were copied from user space. And the unrolled versions of the x86-64 user copy routines would sometimes report that it had copied more bytes than it actually had. Very few uses actually have partial copies to begin with, but to make this bug even harder to trigger, most x86 CPU's use the "rep string" instructions for normal user copies, and that version didn't have this issue. To make it even harder to hit, the one user of this that really cared about the return value (and used the uncached version of the copy that doesn't use the "rep string" instructions) was the generic write routine, which pre-populated its source, once more hiding the problem by avoiding the exception case that triggers the bug. In other words, very special thanks to Bron Gondwana who not only triggered this, but created a test-program to show it, and bisected the behavior down to commit 08291429cfa6258c4cd95d8833beb40f828b194e ("mm: fix pagecache write deadlocks") which changed the access pattern just enough that you can now trigger it with 'writev()' with multiple iovec's. That commit itself was not the cause of the bug, it just allowed all the stars to align just right that you could trigger the problem. [ Side note: this is just the minimal fix to make the copy routines (with __copy_from_user_inatomic_nocache as the particular version that was involved in showing this) have the right return values. We really should improve on the exceptional case further - to make the copy do a byte-accurate copy up to the exact page limit that causes it to fail. As it is, the callers have to do extra work to handle the limit case gracefully. 
] Reported-by: Bron Gondwana Cc: Nick Piggin Cc: Andrew Morton Cc: Andi Kleen Cc: Al Viro Signed-off-by: Linus Torvalds --- arch/x86/lib/copy_user_64.S | 25 +++++++++++-------------- arch/x86/lib/copy_user_nocache_64.S | 25 +++++++++++-------------- 2 files changed, 22 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 70bebd31040..ee1c3f63515 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -217,19 +217,19 @@ ENTRY(copy_user_generic_unrolled) /* table sorted by exception address */ .section __ex_table,"a" .align 8 - .quad .Ls1,.Ls1e - .quad .Ls2,.Ls2e - .quad .Ls3,.Ls3e - .quad .Ls4,.Ls4e - .quad .Ld1,.Ls1e + .quad .Ls1,.Ls1e /* Ls1-Ls4 have copied zero bytes */ + .quad .Ls2,.Ls1e + .quad .Ls3,.Ls1e + .quad .Ls4,.Ls1e + .quad .Ld1,.Ls1e /* Ld1-Ld4 have copied 0-24 bytes */ .quad .Ld2,.Ls2e .quad .Ld3,.Ls3e .quad .Ld4,.Ls4e - .quad .Ls5,.Ls5e - .quad .Ls6,.Ls6e - .quad .Ls7,.Ls7e - .quad .Ls8,.Ls8e - .quad .Ld5,.Ls5e + .quad .Ls5,.Ls5e /* Ls5-Ls8 have copied 32 bytes */ + .quad .Ls6,.Ls5e + .quad .Ls7,.Ls5e + .quad .Ls8,.Ls5e + .quad .Ld5,.Ls5e /* Ld5-Ld8 have copied 32-56 bytes */ .quad .Ld6,.Ls6e .quad .Ld7,.Ls7e .quad .Ld8,.Ls8e @@ -244,11 +244,8 @@ ENTRY(copy_user_generic_unrolled) .quad .Le5,.Le_zero .previous - /* compute 64-offset for main loop. 8 bytes accuracy with error on the - pessimistic side. this is gross. it would be better to fix the - interface. */ /* eax: zero, ebx: 64 */ -.Ls1e: addl $8,%eax +.Ls1e: addl $8,%eax /* eax is bytes left uncopied within the loop (Ls1e: 64 .. Ls8e: 8) */ .Ls2e: addl $8,%eax .Ls3e: addl $8,%eax .Ls4e: addl $8,%eax diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S index 5196762b3b0..9d3d1ab8376 100644 --- a/arch/x86/lib/copy_user_nocache_64.S +++ b/arch/x86/lib/copy_user_nocache_64.S @@ -145,19 +145,19 @@ ENTRY(__copy_user_nocache) /* table sorted by exception address */ .section __ex_table,"a" .align 8 - .quad .Ls1,.Ls1e - .quad .Ls2,.Ls2e - .quad .Ls3,.Ls3e - .quad .Ls4,.Ls4e - .quad .Ld1,.Ls1e + .quad .Ls1,.Ls1e /* .Ls[1-4] - 0 bytes copied */ + .quad .Ls2,.Ls1e + .quad .Ls3,.Ls1e + .quad .Ls4,.Ls1e + .quad .Ld1,.Ls1e /* .Ld[1-4] - 0..24 bytes coped */ .quad .Ld2,.Ls2e .quad .Ld3,.Ls3e .quad .Ld4,.Ls4e - .quad .Ls5,.Ls5e - .quad .Ls6,.Ls6e - .quad .Ls7,.Ls7e - .quad .Ls8,.Ls8e - .quad .Ld5,.Ls5e + .quad .Ls5,.Ls5e /* .Ls[5-8] - 32 bytes copied */ + .quad .Ls6,.Ls5e + .quad .Ls7,.Ls5e + .quad .Ls8,.Ls5e + .quad .Ld5,.Ls5e /* .Ld[5-8] - 32..56 bytes copied */ .quad .Ld6,.Ls6e .quad .Ld7,.Ls7e .quad .Ld8,.Ls8e @@ -172,11 +172,8 @@ ENTRY(__copy_user_nocache) .quad .Le5,.Le_zero .previous - /* compute 64-offset for main loop. 8 bytes accuracy with error on the - pessimistic side. this is gross. it would be better to fix the - interface. */ /* eax: zero, ebx: 64 */ -.Ls1e: addl $8,%eax +.Ls1e: addl $8,%eax /* eax: bytes left uncopied: Ls1e: 64 .. Ls8e: 8 */ .Ls2e: addl $8,%eax .Ls3e: addl $8,%eax .Ls4e: addl $8,%eax -- cgit v1.2.3 From 75118a82e21cafb4a82b53bb85d1c7689787e046 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 13 Jun 2008 15:47:12 -0700 Subject: x86: fix NULL pointer deref in __switch_to Patrick McHardy reported a crash: > > I get this oops once a day, its apparently triggered by something > > run by cron, but the process is a different one each time.
> > > > Kernel is -git from yesterday shortly before the -rc6 release > > (last commit is the usb-2.6 merge, the x86 patches are missing), > > .config is attached. > > > > I'll retry with current -git, but the patches that have gone in > > since I last updated don't look related. > > > > [62060.043009] BUG: unable to handle kernel NULL pointer dereference at > > 000001ff > > [62060.043009] IP: [] __switch_to+0x2f/0x118 > > [62060.043009] *pde = 00000000 > > [62060.043009] Oops: 0002 [#1] PREEMPT Vegard Nossum analyzed it: > This decodes to > > 0: 0f ae 00 fxsave (%eax) > > so it's related to the floating-point context. This is the exact > location of the crash: > > $ addr2line -e arch/x86/kernel/process_32.o -i ab0 > include/asm/i387.h:232 > include/asm/i387.h:262 > arch/x86/kernel/process_32.c:595 > > ...so it looks like prev_task->thread.xstate->fxsave has become NULL. > Or maybe it never had any other value. Somehow (as described below) TS_USEDFPU is set but the fpu is not allocated or freed. Another possible FPU pre-emption issue with the sleazy FPU optimization which was benign before but not so anymore, with the dynamic FPU allocation patch. New task is getting exec'd and it is prempted at the below point. flush_thread() { ... /* * Forget coprocessor state.. */ clear_fpu(tsk); <----- Preemption point clear_used_math(); ... } Now when it context switches in again, as the used_math() is still set and fpu_counter can be > 5, we will do a math_state_restore() which sets the task's TS_USEDFPU. After it continues from the above preemption point it does clear_used_math() and much later free_thread_xstate(). Now, at the next context switch, it is quite possible that xstate is null, used_math() is not set and TS_USEDFPU is still set. This will trigger unlazy_fpu() causing kernel oops. Fix this by clearing tsk's fpu_counter before clearing task's fpu. Reported-by: Patrick McHardy Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 1 + arch/x86/kernel/process_64.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 6d5483356e7..e2db9ac5c61 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -333,6 +333,7 @@ void flush_thread(void) /* * Forget coprocessor state.. */ + tsk->fpu_counter = 0; clear_fpu(tsk); clear_used_math(); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ac54ff56df8..c6eb5c91e5f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -294,6 +294,7 @@ void flush_thread(void) /* * Forget coprocessor state.. */ + tsk->fpu_counter = 0; clear_fpu(tsk); clear_used_math(); } -- cgit v1.2.3 From df17b1d990fc214f033c5588e58216ec941591e0 Mon Sep 17 00:00:00 2001 From: Mikael Pettersson Date: Sun, 15 Jun 2008 02:19:56 +0200 Subject: x86, 32-bit: fix boot failure on TSC-less processors Booting 2.6.26-rc6 on my 486 DX/4 fails with a "BUG: Int 6" (invalid opcode) and a kernel halt immediately after the kernel has been uncompressed. The BUG shows EIP pointing to an rdtsc instruction in native_read_tsc(), invoked from native_sched_clock(). (This error occurs so early that not even the serial console can capture it.) A bisection showed that this bug first occurs in 2.6.26-rc3-git7, via commit 9ccc906c97e34fd91dc6aaf5b69b52d824386910: >x86: distangle user disabled TSC from unstable > >tsc_enabled is set to 0 from the command line switch "notsc" and from >the mark_tsc_unstable code. 
Seperate those functionalities and replace >tsc_enable with tsc_disable. This makes also the native_sched_clock() >decision when to use TSC understandable. > >Preparatory patch to solve the sched_clock() issue on 32 bit. > >Signed-off-by: Thomas Gleixner The core reason for this bug is that native_sched_clock() gets called before tsc_init(). Before the commit above, tsc_32.c used a "tsc_enabled" variable which defaulted to 0 == disabled, and which only got enabled late in tsc_init(). Thus early calls to native_sched_clock() would skip the TSC and use jiffies instead. After the commit above, tsc_32.c uses a "tsc_disabled" variable which defaults to 0, meaning that the TSC is Ok to use. Early calls to native_sched_clock() now erroneously try to use the TSC on !cpu_has_tsc processors, leading to invalid opcode exceptions. My proposed fix is to initialise tsc_disabled to a "soft disabled" state distinct from the hard disabled state set up by the "notsc" kernel option. This fixes the native_sched_clock() problem. It also allows tsc_init() to be simplified: instead of setting tsc_disabled = 1 on every error return, we just set tsc_disabled = 0 once when all checks have succeeded. I've verified that this lets my 486 boot again. I've also verified that a Core2 machine still uses the TSC as clocksource after the patch. Signed-off-by: Mikael Pettersson Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_32.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 068759db63d..65b70637ad9 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -14,7 +14,10 @@ #include "mach_timer.h" -static int tsc_disabled; +/* native_sched_clock() is called before tsc_init(), so + we must start with the TSC soft disabled to prevent + erroneous rdtsc usage on !cpu_has_tsc processors */ +static int tsc_disabled = -1; /* * On some systems the TSC frequency does not @@ -402,25 +405,20 @@ void __init tsc_init(void) { int cpu; - if (!cpu_has_tsc || tsc_disabled) { - /* Disable the TSC in case of !cpu_has_tsc */ - tsc_disabled = 1; + if (!cpu_has_tsc || tsc_disabled > 0) return; - } cpu_khz = calculate_cpu_khz(); tsc_khz = cpu_khz; if (!cpu_khz) { mark_tsc_unstable("could not calculate TSC khz"); - /* - * We need to disable the TSC completely in this case - * to prevent sched_clock() from using it. - */ - tsc_disabled = 1; return; } + /* now allow native_sched_clock() to use rdtsc */ + tsc_disabled = 0; + printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); -- cgit v1.2.3 From d3942cff620bea073fc4e3c8ed878eb1e84615ce Mon Sep 17 00:00:00 2001 From: Bernhard Walle Date: Sun, 8 Jun 2008 16:16:07 +0200 Subject: x86: use BOOTMEM_EXCLUSIVE on 32-bit This patch uses the BOOTMEM_EXCLUSIVE for crashkernel reservation also for i386 and prints a error message on failure. The patch is still for 2.6.26 since it is only bug fixing. The unification of reserve_crashkernel() between i386 and x86_64 should be done for 2.6.27. 
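The semantic difference, as a sketch (the real call site is in the hunk below): BOOTMEM_DEFAULT silently tolerates a range that is already reserved, while BOOTMEM_EXCLUSIVE reports the overlap as an error the caller can act on:

	ret = reserve_bootmem(crash_base, crash_size, BOOTMEM_EXCLUSIVE);
	if (ret < 0) {
		/* region already in use: back out instead of advertising
		 * a crashkernel area the kernel does not actually own */
		return;
	}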
Signed-off-by: Bernhard Walle Signed-off-by: Ingo Molnar Cc: --- arch/x86/kernel/setup_32.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 2c5f8b213e8..5a2f8e06388 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -532,10 +532,16 @@ static void __init reserve_crashkernel(void) (unsigned long)(crash_size >> 20), (unsigned long)(crash_base >> 20), (unsigned long)(total_mem >> 20)); + + if (reserve_bootmem(crash_base, crash_size, + BOOTMEM_EXCLUSIVE) < 0) { + printk(KERN_INFO "crashkernel reservation " + "failed - memory is in use\n"); + return; + } + crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; - reserve_bootmem(crash_base, crash_size, - BOOTMEM_DEFAULT); } else printk(KERN_INFO "crashkernel reservation failed - " "you have to specify a base address\n"); -- cgit v1.2.3 From ffe6e1da86d21d7855495b5a772c93f050258f6e Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Wed, 18 Jun 2008 11:34:38 -0600 Subject: x86, geode: add a VSA2 ID for General Software General Software writes their own VSA2 module for their version of the Geode BIOS, which returns a different ID than the standard VSA2. This was causing the framebuffer driver to break for most GSW boards. Signed-off-by: Jordan Crouse Cc: tglx@linutronix.de Cc: linux-geode@lists.infradead.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/geode_32.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c index e8edd63ab00..9b08e852fd1 100644 --- a/arch/x86/kernel/geode_32.c +++ b/arch/x86/kernel/geode_32.c @@ -166,6 +166,8 @@ int geode_has_vsa2(void) static int has_vsa2 = -1; if (has_vsa2 == -1) { + u16 val; + /* * The VSA has virtual registers that we can query for a * signature. @@ -173,7 +175,8 @@ int geode_has_vsa2(void) outw(VSA_VR_UNLOCK, VSA_VRC_INDEX); outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX); - has_vsa2 = (inw(VSA_VRC_DATA) == VSA_SIG); + val = inw(VSA_VRC_DATA); + has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG); } return has_vsa2; -- cgit v1.2.3 From 05345b0f006ac226d0d25d48fcb2d792ac44a071 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Jun 2008 15:01:53 -0700 Subject: xen: mask unwanted pte bits in __supported_pte_mask [ Stable: this isn't a bugfix in itself, but it's a prerequisite for "xen: don't drop NX bit" ] Signed-off-by: Jeremy Fitzhardinge Cc: Stable Kernel Cc: the arch/x86 maintainers Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 5 +++++ arch/x86/xen/mmu.c | 4 +--- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c8a56e457d6..c048de34d6a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1228,6 +1228,11 @@ asmlinkage void __init xen_start_kernel(void) if (xen_feature(XENFEAT_supervisor_mode_kernel)) pv_info.kernel_rpl = 0; + /* Prevent unwanted bits from being set in PTEs.
*/ + __supported_pte_mask &= ~_PAGE_GLOBAL; + if (!is_initial_xendomain()) + __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); + /* set the limit of our address space */ xen_reserve_top(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3525ef523a7..3f2a67fe6ad 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -199,10 +199,8 @@ pgdval_t xen_pgd_val(pgd_t pgd) pte_t xen_make_pte(pteval_t pte) { - if (pte & _PAGE_PRESENT) { + if (pte & _PAGE_PRESENT) pte = phys_to_machine(XPADDR(pte)).maddr; - pte &= ~(_PAGE_PCD | _PAGE_PWT); - } return (pte_t){ .pte = pte }; } -- cgit v1.2.3 From ebb9cfe20fe167f29960a5e913193a684fac50bf Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Jun 2008 15:01:56 -0700 Subject: xen: don't drop NX bit Because NX is now enforced properly, we must put the hypercall page into the .text segment so that it is executable. Signed-off-by: Jeremy Fitzhardinge Cc: Stable Kernel Cc: the arch/x86 maintainers Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 54 +++++++++++++++++++++++++++---------------------- arch/x86/xen/xen-head.S | 2 +- 2 files changed, 31 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3f2a67fe6ad..265601d5a6a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -179,46 +179,54 @@ out: preempt_enable(); } -pteval_t xen_pte_val(pte_t pte) +/* Assume pteval_t is equivalent to all the other *val_t types. */ +static pteval_t pte_mfn_to_pfn(pteval_t val) { - pteval_t ret = pte.pte; + if (val & _PAGE_PRESENT) { + unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; + pteval_t flags = val & ~PTE_MASK; + val = (mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; + } - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; + return val; +} - return ret; +static pteval_t pte_pfn_to_mfn(pteval_t val) +{ + if (val & _PAGE_PRESENT) { + unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; + pteval_t flags = val & ~PTE_MASK; + val = (pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; + } + + return val; +} + +pteval_t xen_pte_val(pte_t pte) +{ + return pte_mfn_to_pfn(pte.pte); } pgdval_t xen_pgd_val(pgd_t pgd) { - pgdval_t ret = pgd.pgd; - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; - return ret; + return pte_mfn_to_pfn(pgd.pgd); } pte_t xen_make_pte(pteval_t pte) { - if (pte & _PAGE_PRESENT) - pte = phys_to_machine(XPADDR(pte)).maddr; - - return (pte_t){ .pte = pte }; + pte = pte_pfn_to_mfn(pte); + return native_make_pte(pte); } pgd_t xen_make_pgd(pgdval_t pgd) { - if (pgd & _PAGE_PRESENT) - pgd = phys_to_machine(XPADDR(pgd)).maddr; - - return (pgd_t){ pgd }; + pgd = pte_pfn_to_mfn(pgd); + return native_make_pgd(pgd); } pmdval_t xen_pmd_val(pmd_t pmd) { - pmdval_t ret = native_pmd_val(pmd); - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; - return ret; + return pte_mfn_to_pfn(pmd.pmd); } #ifdef CONFIG_X86_PAE void xen_set_pud(pud_t *ptr, pud_t val) @@ -265,9 +273,7 @@ void xen_pmd_clear(pmd_t *pmdp) pmd_t xen_make_pmd(pmdval_t pmd) { - if (pmd & _PAGE_PRESENT) - pmd = phys_to_machine(XPADDR(pmd)).maddr; - + pmd = pte_pfn_to_mfn(pmd); return native_make_pmd(pmd); } #else /* !PAE */ diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 288d587ce73..3175e973fd0 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -17,7 +17,7 @@ ENTRY(startup_xen) __FINIT -.pushsection .bss.page_aligned +.pushsection .text .align PAGE_SIZE_asm ENTRY(hypercall_page) .skip 
0x1000 -- cgit v1.2.3 From d4acf7e7abe45457e751525a2a4d5b693dfdd597 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 6 Jun 2008 16:37:35 -0300 Subject: KVM: Fix race between timer migration and vcpu migration A guest vcpu instance can be scheduled to a different physical CPU between the test for KVM_REQ_MIGRATE_TIMER and local_irq_disable(). If that happens, the timer will only be migrated to the current pCPU on the next exit, meaning that a guest LAPIC timer event can be delayed until a host interrupt is triggered. Fix it by cancelling guest entry if any vcpu request is pending. This has the side effect of nicely consolidating vcpu->requests checks. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 00acf1301a1..b90744a1dc3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2759,6 +2759,8 @@ again: if (vcpu->requests) { if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) __kvm_migrate_timers(vcpu); + if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) + kvm_x86_ops->tlb_flush(vcpu); if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests)) { kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; @@ -2781,21 +2783,13 @@ again: local_irq_disable(); - if (need_resched()) { + if (vcpu->requests || need_resched()) { local_irq_enable(); preempt_enable(); r = 1; goto out; } - if (vcpu->requests) - if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) { - local_irq_enable(); - preempt_enable(); - r = 1; - goto out; - } - if (signal_pending(current)) { local_irq_enable(); preempt_enable(); @@ -2825,9 +2819,6 @@ again: kvm_guest_enter(); - if (vcpu->requests) - if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) - kvm_x86_ops->tlb_flush(vcpu); KVMTRACE_0D(VMENTRY, vcpu, entryexit); kvm_x86_ops->run(vcpu, kvm_run); -- cgit v1.2.3 From 06e05645661211b9eaadaf6344c335d2e80f0ba2 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 6 Jun 2008 16:37:36 -0300 Subject: KVM: close timer injection race window in __vcpu_run If a timer fires after kvm_inject_pending_timer_irqs() but before local_irq_disable() the code will enter guest mode and only inject such a timer interrupt the next time an unrelated event causes an exit. It would be simpler if the timer->pending irq conversion could be done with IRQs disabled, so that the above problem cannot happen. For now introduce a new vcpu requests bit to cancel guest entry.
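The handshake can be modelled in a few lines of standalone C. This is not kernel code -- C11 atomics stand in for the set_bit()/test_bit() calls on vcpu->requests, and the interrupt-disable point is only a comment -- but it shows how a timer that fires after the injection pass still cancels the entry:

#include <stdatomic.h>
#include <stdio.h>

#define REQ_PENDING_TIMER (1UL << 0)

static atomic_ulong requests;

/* timer side: record the event before attempting any wakeup */
static void timer_fires(void)
{
	atomic_fetch_or(&requests, REQ_PENDING_TIMER);
}

/* entry side: returns 1 when it is safe to enter the guest */
static int try_enter_guest(void)
{
	/* local_irq_disable() happens here in the kernel */
	if (atomic_load(&requests))
		return 0;	/* a request slipped in: cancel the entry */
	return 1;
}

int main(void)
{
	atomic_fetch_and(&requests, ~REQ_PENDING_TIMER); /* pre-injection clear */
	/* ...kvm_inject_pending_timer_irqs() would run here... */
	timer_fires();				/* the window the patch closes */
	printf("enter guest: %d\n", try_enter_guest());	/* prints 0 */
	return 0;
}

A cancelled entry simply drops back into the run loop, where the timer->pending count is converted into an injected interrupt before the next attempt.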
Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 9 ++++++--- arch/x86/kvm/lapic.c | 1 + arch/x86/kvm/x86.c | 1 + 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index f2f5d260874..3829aa7b663 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -200,9 +200,12 @@ int __pit_timer_fn(struct kvm_kpit_state *ps) atomic_inc(&pt->pending); smp_mb__after_atomic_inc(); - if (vcpu0 && waitqueue_active(&vcpu0->wq)) { - vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; - wake_up_interruptible(&vcpu0->wq); + if (vcpu0) { + set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); + if (waitqueue_active(&vcpu0->wq)) { + vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; + wake_up_interruptible(&vcpu0->wq); + } } pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c297c50eba6..ebc03f5ae16 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -940,6 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic) wait_queue_head_t *q = &apic->vcpu->wq; atomic_inc(&apic->timer.pending); + set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); if (waitqueue_active(q)) { apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; wake_up_interruptible(q); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b90744a1dc3..b08812d6b34 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2774,6 +2774,7 @@ again: } } + clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); kvm_inject_pending_timer_irqs(vcpu); preempt_disable(); -- cgit v1.2.3 From 6597ca09e6c0e5aec7ffd2b8ab48c671d3c28414 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Sun, 8 Jun 2008 01:48:53 -0300 Subject: KVM: MMU: Fix rmap_write_protect() hugepage iteration bug rmap_next() does not work correctly after rmap_remove(), as it expects the rmap chains not to change during iteration. Fix (for now) by restarting iteration from the beginning. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ee3f53098f0..9628091c574 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -640,6 +640,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) rmap_remove(kvm, spte); --kvm->stat.lpages; set_shadow_pte(spte, shadow_trap_nonpresent_pte); + spte = NULL; write_protected = 1; } spte = rmap_next(kvm, rmapp, spte); -- cgit v1.2.3 From 3094538739415a9225afd2a6c78cb0fe1c1f641b Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 11 Jun 2008 20:32:40 -0300 Subject: KVM: MMU: large page update_pte issue with non-PAE 32-bit guests (resend) kvm_mmu_pte_write() does not handle 32-bit non-PAE large page backed guests properly. It will instantiate two 2MB sptes pointing to the same physical 2MB page when a guest large pte update is trapped. Instead of duplicating code to handle this, disallow directory level updates to happen through kvm_mmu_pte_write(), so the two 2MB sptes emulating one guest 4MB pte can be correctly created by the page fault handling path. 
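The new guard can be restated as a standalone predicate. The constants below mirror the kernel's values at the time (PT_PAGE_TABLE_LEVEL == 1, PT32_ROOT_LEVEL == 2, PT64_ROOT_LEVEL == 4), but the function is only an illustration of the dispatch logic in mmu_pte_write_new_pte, not a copy of the kernel code:

#include <stdbool.h>
#include <stdio.h>

#define PT_PAGE_TABLE_LEVEL	1	/* leaf page table */
#define PT32_ROOT_LEVEL		2	/* 32-bit non-PAE guest */
#define PT64_ROOT_LEVEL		4	/* long-mode guest */

/* true when the write must be zapped and left to the fault path */
static bool zap_instead_of_update(int sp_level, bool largepage_update,
				  int glevels)
{
	if (sp_level == PT_PAGE_TABLE_LEVEL)
		return false;		/* ordinary leaf pte: update in place */
	if (!largepage_update)
		return true;		/* directory write, no large page */
	return glevels == PT32_ROOT_LEVEL;	/* 4MB guest pte: refault */
}

int main(void)
{
	printf("%d\n", zap_instead_of_update(2, true, PT32_ROOT_LEVEL)); /* 1 */
	printf("%d\n", zap_instead_of_update(2, true, PT64_ROOT_LEVEL)); /* 0 */
	return 0;
}

Zapping forces the next guest access through the page fault path, the one place that knows how to materialize a single non-PAE 4MB guest pte as two adjacent 2MB sptes.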
Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9628091c574..baa6503894d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1581,11 +1581,13 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, u64 *spte, const void *new) { - if ((sp->role.level != PT_PAGE_TABLE_LEVEL) - && !vcpu->arch.update_pte.largepage) { - ++vcpu->kvm->stat.mmu_pde_zapped; - return; - } + if (sp->role.level != PT_PAGE_TABLE_LEVEL) { + if (!vcpu->arch.update_pte.largepage || + sp->role.glevels == PT32_ROOT_LEVEL) { + ++vcpu->kvm->stat.mmu_pde_zapped; + return; + } + } ++vcpu->kvm->stat.mmu_pte_updated; if (sp->role.glevels == PT32_ROOT_LEVEL) -- cgit v1.2.3 From 6bf6a9532fd03ad719f0c86654f16ef777b78fc6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 12 Jun 2008 16:54:41 +0300 Subject: KVM: MMU: Fix oops on guest userspace access to guest pagetable KVM has a heuristic to unshadow guest pagetables when userspace accesses them, on the assumption that most guests do not allow userspace to access pagetables directly. Unfortunately, in addition to unshadowing the pagetables, it also oopses. This never triggers on ordinary guests since sane OSes will clear the pagetables before assigning them to userspace, which will trigger the flood heuristic, unshadowing the pagetables before the first userspace access. One particular guest, though (Xenner) will run the kernel in userspace, triggering the oops. Since the heuristic is incorrect in this case, we can simply remove it. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index baa6503894d..7e7c3969f7a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1083,10 +1083,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, struct kvm_mmu_page *shadow; spte |= PT_WRITABLE_MASK; - if (user_fault) { - mmu_unshadow(vcpu->kvm, gfn); - goto unshadowed; - } shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); if (shadow || @@ -1103,8 +1099,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, } } -unshadowed: - if (pte_access & ACC_WRITE_MASK) mark_page_dirty(vcpu->kvm, gfn); -- cgit v1.2.3 From a9b21b622958afc3f3bc5a23d266dd9ed1171fd3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 24 Jun 2008 11:48:49 +0300 Subject: KVM: VMX: Fix host msr corruption with preemption enabled Switching msrs can occur either synchronously as a result of calls to the msr management functions (usually in response to the guest touching virtualized msrs), or asynchronously when preempting a kvm thread that has guest state loaded. If we're unlucky enough to have the two at the same time, host msrs are corrupted and the machine goes kaput on the next syscall. Most easily triggered by Windows Server 2008, as it does a lot of msr switching during bootup. 
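The shape of the fix is easier to see in a reduced model. Nothing below is VMX code -- preemption is modelled as a bare counter and the MSR reload as a flag -- but it shows the split into a raw helper for callers that are already non-preemptible and a wrapper for the preemptible MSR paths:

#include <stdio.h>

static int preempt_count;			/* stand-in for the real count */
static void preempt_disable(void) { preempt_count++; }
static void preempt_enable(void)  { preempt_count--; }

static int guest_state_loaded = 1;		/* guest MSRs currently live */

/* raw variant: caller guarantees preemption is already off,
 * as on the vcpu_put path */
static void load_host_state_raw(void)
{
	if (!guest_state_loaded)
		return;
	guest_state_loaded = 0;
	/* ...reload segment registers, host MSRs, host EFER... */
}

/* safe variant for preemptible paths such as the MSR write handlers;
 * without preempt_disable() a preemption in the middle could run the
 * asynchronous switch and corrupt the state being reloaded */
static void load_host_state(void)
{
	preempt_disable();
	load_host_state_raw();
	preempt_enable();
}

int main(void)
{
	load_host_state();
	printf("guest_state_loaded=%d preempt_count=%d\n",
	       guest_state_loaded, preempt_count);
	return 0;
}

This is the same structure the diff gives vmx_load_host_state()/__vmx_load_host_state(): the synchronous MSR paths go through the preemption-safe wrapper, while vcpu_put keeps calling the raw variant.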
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 02efbe75f31..540e9517907 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -566,7 +566,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) load_transition_efer(vmx); } -static void vmx_load_host_state(struct vcpu_vmx *vmx) +static void __vmx_load_host_state(struct vcpu_vmx *vmx) { unsigned long flags; @@ -596,6 +596,13 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) reload_host_efer(vmx); } +static void vmx_load_host_state(struct vcpu_vmx *vmx) +{ + preempt_disable(); + __vmx_load_host_state(vmx); + preempt_enable(); +} + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -654,7 +661,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { - vmx_load_host_state(to_vmx(vcpu)); + __vmx_load_host_state(to_vmx(vcpu)); } static void vmx_fpu_activate(struct kvm_vcpu *vcpu) @@ -884,11 +891,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) switch (msr_index) { #ifdef CONFIG_X86_64 case MSR_EFER: + vmx_load_host_state(vmx); ret = kvm_set_msr_common(vcpu, msr_index, data); - if (vmx->host_state.loaded) { - reload_host_efer(vmx); - load_transition_efer(vmx); - } break; case MSR_FS_BASE: vmcs_writel(GUEST_FS_BASE, data); @@ -910,11 +914,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) guest_write_tsc(data); break; default: + vmx_load_host_state(vmx); msr = find_msr_entry(vmx, msr_index); if (msr) { msr->data = data; - if (vmx->host_state.loaded) - load_msrs(vmx->guest_msrs, vmx->save_nmsrs); break; } ret = kvm_set_msr_common(vcpu, msr_index, data); -- cgit v1.2.3 From 28499143933f19b28008a556ed59255d6009391a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 9 May 2008 12:05:57 +0100 Subject: xen: remove support for non-PAE 32-bit Non-PAE operation has been deprecated in Xen for a while, and is rarely tested or used. xen-unstable has now officially dropped non-PAE support. Since Xen/pvops' non-PAE support has also been broken for a while, we may as well completely drop it altogether. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/xen/Kconfig | 2 +- arch/x86/xen/enlighten.c | 51 +++++++++++++++++------------------------------- arch/x86/xen/mmu.c | 19 ++---------------- arch/x86/xen/mmu.h | 24 ++++++----------------- arch/x86/xen/xen-head.S | 4 ---- 5 files changed, 27 insertions(+), 73 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 2e641be2737..525b108411b 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -6,7 +6,7 @@ config XEN bool "Xen guest support" select PARAVIRT depends on X86_32 - depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) + depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) help This is the Linux Xen port. 
Enabling this will allow the kernel to boot in a paravirtualized environment under the diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c048de34d6a..f09c1c69c37 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -785,38 +785,35 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) static __init void xen_pagetable_setup_start(pgd_t *base) { pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; + int i; /* special set_pte for pagetable initialization */ pv_mmu_ops.set_pte = xen_set_pte_init; init_mm.pgd = base; /* - * copy top-level of Xen-supplied pagetable into place. For - * !PAE we can use this as-is, but for PAE it is a stand-in - * while we copy the pmd pages. + * copy top-level of Xen-supplied pagetable into place. This + * is a stand-in while we copy the pmd pages. */ memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); - if (PTRS_PER_PMD > 1) { - int i; - /* - * For PAE, need to allocate new pmds, rather than - * share Xen's, since Xen doesn't like pmd's being - * shared between address spaces. - */ - for (i = 0; i < PTRS_PER_PGD; i++) { - if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { - pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + /* + * For PAE, need to allocate new pmds, rather than + * share Xen's, since Xen doesn't like pmd's being + * shared between address spaces. + */ + for (i = 0; i < PTRS_PER_PGD; i++) { + if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { + pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); - memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), - PAGE_SIZE); + memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), + PAGE_SIZE); - make_lowmem_page_readonly(pmd); + make_lowmem_page_readonly(pmd); - set_pgd(&base[i], __pgd(1 + __pa(pmd))); - } else - pgd_clear(&base[i]); - } + set_pgd(&base[i], __pgd(1 + __pa(pmd))); + } else + pgd_clear(&base[i]); } /* make sure zero_page is mapped RO so we can use it in pagetables */ @@ -873,17 +870,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) /* Actually pin the pagetable down, but we can't set PG_pinned yet because the page structures don't exist yet. */ - { - unsigned level; - -#ifdef CONFIG_X86_PAE - level = MMUEXT_PIN_L3_TABLE; -#else - level = MMUEXT_PIN_L2_TABLE; -#endif - - pin_pagetable_pfn(level, PFN_DOWN(__pa(base))); - } + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); } /* This is called once we have the cpu_possible_map */ @@ -1093,7 +1080,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .make_pte = xen_make_pte, .make_pgd = xen_make_pgd, -#ifdef CONFIG_X86_PAE .set_pte_atomic = xen_set_pte_atomic, .set_pte_present = xen_set_pte_at, .set_pud = xen_set_pud, @@ -1102,7 +1088,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .make_pmd = xen_make_pmd, .pmd_val = xen_pmd_val, -#endif /* PAE */ .activate_mm = xen_activate_mm, .dup_mmap = xen_dup_mmap, diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 265601d5a6a..df40bf74ea7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -228,7 +228,7 @@ pmdval_t xen_pmd_val(pmd_t pmd) { return pte_mfn_to_pfn(pmd.pmd); } -#ifdef CONFIG_X86_PAE + void xen_set_pud(pud_t *ptr, pud_t val) { struct multicall_space mcs; @@ -276,12 +276,6 @@ pmd_t xen_make_pmd(pmdval_t pmd) pmd = pte_pfn_to_mfn(pmd); return native_make_pmd(pmd); } -#else /* !PAE */ -void xen_set_pte(pte_t *ptep, pte_t pte) -{ - *ptep = pte; -} -#endif /* CONFIG_X86_PAE */ /* (Yet another) pagetable walker. 
This one is intended for pinning a @@ -434,8 +428,6 @@ static int pin_page(struct page *page, enum pt_level level) read-only, and can be pinned. */ void xen_pgd_pin(pgd_t *pgd) { - unsigned level; - xen_mc_batch(); if (pgd_walk(pgd, pin_page, TASK_SIZE)) { @@ -445,14 +437,7 @@ void xen_pgd_pin(pgd_t *pgd) xen_mc_batch(); } -#ifdef CONFIG_X86_PAE - level = MMUEXT_PIN_L3_TABLE; -#else - level = MMUEXT_PIN_L2_TABLE; -#endif - - xen_do_pin(level, PFN_DOWN(__pa(pgd))); - + xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); xen_mc_issue(0); } diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index b5e189b1519..5fe961caffd 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -37,14 +37,13 @@ void xen_exit_mmap(struct mm_struct *mm); void xen_pgd_pin(pgd_t *pgd); //void xen_pgd_unpin(pgd_t *pgd); -#ifdef CONFIG_X86_PAE -unsigned long long xen_pte_val(pte_t); -unsigned long long xen_pmd_val(pmd_t); -unsigned long long xen_pgd_val(pgd_t); +pteval_t xen_pte_val(pte_t); +pmdval_t xen_pmd_val(pmd_t); +pgdval_t xen_pgd_val(pgd_t); -pte_t xen_make_pte(unsigned long long); -pmd_t xen_make_pmd(unsigned long long); -pgd_t xen_make_pgd(unsigned long long); +pte_t xen_make_pte(pteval_t); +pmd_t xen_make_pmd(pmdval_t); +pgd_t xen_make_pgd(pgdval_t); void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); @@ -53,15 +52,4 @@ void xen_set_pud(pud_t *ptr, pud_t val); void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void xen_pmd_clear(pmd_t *pmdp); - -#else -unsigned long xen_pte_val(pte_t); -unsigned long xen_pmd_val(pmd_t); -unsigned long xen_pgd_val(pgd_t); - -pte_t xen_make_pte(unsigned long); -pmd_t xen_make_pmd(unsigned long); -pgd_t xen_make_pgd(unsigned long); -#endif - #endif /* _XEN_MMU_H */ diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 3175e973fd0..6ec3b4f7719 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -30,11 +30,7 @@ ENTRY(hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") -#ifdef CONFIG_X86_PAE ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") -#else - ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") -#endif ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") #endif /*CONFIG_XEN */ -- cgit v1.2.3 From 7af192c954017499ec163bc9dbaaee2e593d7ef2 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 3 Jun 2008 16:17:29 +0200 Subject: x86: Add structs and functions for paravirt clocksource This patch adds structs for the paravirt clocksource ABI used by both xen and kvm (pvclock-abi.h). It also adds some helper functions to read system time and wall clock time from a paravirtual clocksource (pvclock.[ch]). They are based on the xen code. They are enabled using CONFIG_PARAVIRT_CLOCK. Subsequent patches of this series will put the code in use. Signed-off-by: Gerd Hoffmann Acked-by: Jeremy Fitzhardinge Signed-off-by: Avi Kivity --- arch/x86/Kconfig | 4 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/pvclock.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 146 insertions(+) create mode 100644 arch/x86/kernel/pvclock.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 52e18e6d2ba..f94bca6ff47 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -410,6 +410,10 @@ config PARAVIRT over full virtualization. 
However, when run without a hypervisor the kernel is theoretically slower and slightly larger. +config PARAVIRT_CLOCK + bool + default n + endif config MEMTEST_BOOTPARAM diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b472..77807d4769c 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -82,6 +82,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o +obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c new file mode 100644 index 00000000000..05fbe9a0325 --- /dev/null +++ b/arch/x86/kernel/pvclock.c @@ -0,0 +1,141 @@ +/* paravirtual clock -- common code used by kvm/xen + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include +#include +#include + +/* + * These are periodically updated + * xen: magic shared_info page + * kvm: gpa registered via msr + * and then copied here. + */ +struct pvclock_shadow_time { + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_timestamp; /* Time, in nanosecs, since boot. */ + u32 tsc_to_nsec_mul; + int tsc_shift; + u32 version; +}; + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif __x86_64__ + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! +#endif + + return product; +} + +static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) +{ + u64 delta = native_read_tsc() - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + +/* + * Reads a consistent set of time-base values from hypervisor, + * into a shadow data area.
+ */ +static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, + struct pvclock_vcpu_time_info *src) +{ + do { + dst->version = src->version; + rmb(); /* fetch version before data */ + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + rmb(); /* test version after fetching data */ + } while ((src->version & 1) || (dst->version != src->version)); + + return dst->version; +} + +cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) +{ + struct pvclock_shadow_time shadow; + unsigned version; + cycle_t ret, offset; + + do { + version = pvclock_get_time_values(&shadow, src); + barrier(); + offset = pvclock_get_nsec_offset(&shadow); + ret = shadow.system_timestamp + offset; + barrier(); + } while (version != src->version); + + return ret; +} + +void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, + struct pvclock_vcpu_time_info *vcpu_time, + struct timespec *ts) +{ + u32 version; + u64 delta; + struct timespec now; + + /* get wallclock at system boot */ + do { + version = wall_clock->version; + rmb(); /* fetch version before time */ + now.tv_sec = wall_clock->sec; + now.tv_nsec = wall_clock->nsec; + rmb(); /* fetch time before checking version */ + } while ((wall_clock->version & 1) || (version != wall_clock->version)); + + delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */ + delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + + now.tv_nsec = do_div(delta, NSEC_PER_SEC); + now.tv_sec = delta; + + set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); +} -- cgit v1.2.3 From 1c7b67f7576c4ca2a344379a4a29eec8fe8e7935 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 3 Jun 2008 16:17:30 +0200 Subject: x86: Make xen use the paravirt clocksource structs and functions This patch updates the xen guest to use the pvclock structs and helper functions. Signed-off-by: Gerd Hoffmann Acked-by: Jeremy Fitzhardinge Signed-off-by: Avi Kivity --- arch/x86/xen/Kconfig | 1 + arch/x86/xen/time.c | 132 +++++---------------------------------------------- 2 files changed, 13 insertions(+), 120 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 2e641be2737..3a4f16aea4b 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -5,6 +5,7 @@ config XEN bool "Xen guest support" select PARAVIRT + select PARAVIRT_CLOCK depends on X86_32 depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) help diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 52b2e385698..41e217503c9 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -31,17 +32,6 @@ static cycle_t xen_clocksource_read(void); -/* These are perodically updated in shared_info, and then copied here. */ -struct shadow_time_info { - u64 tsc_timestamp; /* TSC at last update of time vals. */ - u64 system_timestamp; /* Time, in nanosecs, since boot. 
*/ - u32 tsc_to_nsec_mul; - int tsc_shift; - u32 version; -}; - -static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); - /* runstate info updated by Xen */ static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); @@ -211,7 +201,7 @@ unsigned long long xen_sched_clock(void) unsigned long xen_cpu_khz(void) { u64 xen_khz = 1000000ULL << 32; - const struct vcpu_time_info *info = + const struct pvclock_vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_info[0].time; do_div(xen_khz, info->tsc_to_system_mul); @@ -223,121 +213,26 @@ unsigned long xen_cpu_khz(void) return xen_khz; } -/* - * Reads a consistent set of time-base values from Xen, into a shadow data - * area. - */ -static unsigned get_time_values_from_xen(void) -{ - struct vcpu_time_info *src; - struct shadow_time_info *dst; - - /* src is shared memory with the hypervisor, so we need to - make sure we get a consistent snapshot, even in the face of - being preempted. */ - src = &__get_cpu_var(xen_vcpu)->time; - dst = &__get_cpu_var(shadow_time); - - do { - dst->version = src->version; - rmb(); /* fetch version before data */ - dst->tsc_timestamp = src->tsc_timestamp; - dst->system_timestamp = src->system_time; - dst->tsc_to_nsec_mul = src->tsc_to_system_mul; - dst->tsc_shift = src->tsc_shift; - rmb(); /* test version after fetching data */ - } while ((src->version & 1) | (dst->version ^ src->version)); - - return dst->version; -} - -/* - * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, - * yielding a 64-bit result. - */ -static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) -{ - u64 product; -#ifdef __i386__ - u32 tmp1, tmp2; -#endif - - if (shift < 0) - delta >>= -shift; - else - delta <<= shift; - -#ifdef __i386__ - __asm__ ( - "mul %5 ; " - "mov %4,%%eax ; " - "mov %%edx,%4 ; " - "mul %5 ; " - "xor %5,%5 ; " - "add %4,%%eax ; " - "adc %5,%%edx ; " - : "=A" (product), "=r" (tmp1), "=r" (tmp2) - : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); -#elif __x86_64__ - __asm__ ( - "mul %%rdx ; shrd $32,%%rdx,%%rax" - : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); -#else -#error implement me! 
-#endif - - return product; -} - -static u64 get_nsec_offset(struct shadow_time_info *shadow) -{ - u64 now, delta; - now = native_read_tsc(); - delta = now - shadow->tsc_timestamp; - return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); -} - static cycle_t xen_clocksource_read(void) { - struct shadow_time_info *shadow = &get_cpu_var(shadow_time); + struct pvclock_vcpu_time_info *src; cycle_t ret; - unsigned version; - - do { - version = get_time_values_from_xen(); - barrier(); - ret = shadow->system_timestamp + get_nsec_offset(shadow); - barrier(); - } while (version != __get_cpu_var(xen_vcpu)->time.version); - - put_cpu_var(shadow_time); + src = &get_cpu_var(xen_vcpu)->time; + ret = pvclock_clocksource_read(src); + put_cpu_var(xen_vcpu); return ret; } static void xen_read_wallclock(struct timespec *ts) { - const struct shared_info *s = HYPERVISOR_shared_info; - u32 version; - u64 delta; - struct timespec now; - - /* get wallclock at system boot */ - do { - version = s->wc_version; - rmb(); /* fetch version before time */ - now.tv_sec = s->wc_sec; - now.tv_nsec = s->wc_nsec; - rmb(); /* fetch time before checking version */ - } while ((s->wc_version & 1) | (version ^ s->wc_version)); + struct shared_info *s = HYPERVISOR_shared_info; + struct pvclock_wall_clock *wall_clock = &(s->wc); + struct pvclock_vcpu_time_info *vcpu_time; - delta = xen_clocksource_read(); /* time since system boot */ - delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; - - now.tv_nsec = do_div(delta, NSEC_PER_SEC); - now.tv_sec = delta; - - set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); + vcpu_time = &get_cpu_var(xen_vcpu)->time; + pvclock_read_wallclock(wall_clock, vcpu_time, ts); + put_cpu_var(xen_vcpu); } unsigned long xen_get_wallclock(void) @@ -345,7 +240,6 @@ unsigned long xen_get_wallclock(void) struct timespec ts; xen_read_wallclock(&ts); - return ts.tv_sec; } @@ -569,8 +463,6 @@ __init void xen_time_init(void) { int cpu = smp_processor_id(); - get_time_values_from_xen(); - clocksource_register(&xen_clocksource); if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { -- cgit v1.2.3 From 50d0a0f987b83a8dadb1134d834e35ec410392b5 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 3 Jun 2008 16:17:31 +0200 Subject: KVM: Make kvm host use the paravirt clocksource structs This patch updates the kvm host code to use the pvclock structs. It also makes the paravirt clock compatible with Xen. 
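The mul/shift search that the diff below adds as kvm_set_time_scale(), together with the scale_delta() consumer math from earlier in this series, can be checked with a self-contained program. The 2.4 GHz figure is an arbitrary example and __uint128_t (a GCC/Clang extension) stands in for the 32-bit assembly; the algebra itself follows the patches:

#include <stdint.h>
#include <stdio.h>

/* (dividend << 32) / divisor, the job div_frac() does with divl */
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
	return (uint32_t)(((uint64_t)dividend << 32) / divisor);
}

/* mirror of the search in kvm_set_time_scale(): pick a shift and a
 * 32-bit fixed-point multiplier so that nanoseconds come out as
 * ((delta pre-shifted by shift) * mul) >> 32 */
static void set_time_scale(uint32_t tsc_khz, int *shift, uint32_t *mul)
{
	uint64_t nsecs = 1000000000ULL;
	uint64_t tps64 = (uint64_t)tsc_khz * 1000;
	uint32_t tps32;
	int s = 0;

	while (tps64 > nsecs * 2) {
		tps64 >>= 1;
		s--;
	}
	tps32 = (uint32_t)tps64;
	while (tps32 <= (uint32_t)nsecs) {
		tps32 <<= 1;
		s++;
	}
	*shift = s;
	*mul = div_frac((uint32_t)nsecs, tps32);
}

/* scale_delta() without the inline assembly */
static uint64_t scale_delta(uint64_t delta, uint32_t mul, int shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	return (uint64_t)(((__uint128_t)delta * mul) >> 32);
}

int main(void)
{
	uint32_t mul;
	int shift;

	set_time_scale(2400000, &shift, &mul);	/* a 2.4 GHz TSC */
	/* one second's worth of TSC ticks should read back as ~1e9 ns */
	printf("shift=%d mul=%u ns=%llu\n", shift, mul,
	       (unsigned long long)scale_delta(2400000000ULL, mul, shift));
	return 0;
}

For 2.4 GHz this yields shift == -1 and ns == 999999999, i.e. one second of ticks converts back to one second of nanoseconds to within rounding, which is exactly the property the guest-side readers depend on.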
Signed-off-by: Gerd Hoffmann Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 75 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b08812d6b34..63a77caa59f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -492,8 +492,8 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { static int version; - struct kvm_wall_clock wc; - struct timespec wc_ts; + struct pvclock_wall_clock wc; + struct timespec now, sys, boot; if (!wall_clock) return; @@ -502,10 +502,19 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); - wc_ts = current_kernel_time(); - wc.wc_sec = wc_ts.tv_sec; - wc.wc_nsec = wc_ts.tv_nsec; - wc.wc_version = version; + /* + * The guest calculates current wall clock time by adding + * system time (updated by kvm_write_guest_time below) to the + * wall clock specified here. guest system time equals host + * system time for us, thus we must fill in host boot time here. + */ + now = current_kernel_time(); + ktime_get_ts(&sys); + boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); + + wc.sec = boot.tv_sec; + wc.nsec = boot.tv_nsec; + wc.version = version; kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); @@ -513,6 +522,45 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); } +static uint32_t div_frac(uint32_t dividend, uint32_t divisor) +{ + uint32_t quotient, remainder; + + /* Don't try to replace with do_div(), this one calculates + * "(dividend << 32) / divisor" */ + __asm__ ( "divl %4" + : "=a" (quotient), "=d" (remainder) + : "0" (0), "1" (dividend), "r" (divisor) ); + return quotient; +} + +static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) +{ + uint64_t nsecs = 1000000000LL; + int32_t shift = 0; + uint64_t tps64; + uint32_t tps32; + + tps64 = tsc_khz * 1000LL; + while (tps64 > nsecs*2) { + tps64 >>= 1; + shift--; + } + + tps32 = (uint32_t)tps64; + while (tps32 <= (uint32_t)nsecs) { + tps32 <<= 1; + shift++; + } + + hv_clock->tsc_shift = shift; + hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); + + pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", + __FUNCTION__, tsc_khz, hv_clock->tsc_shift, + hv_clock->tsc_to_system_mul); +} + static void kvm_write_guest_time(struct kvm_vcpu *v) { struct timespec ts; @@ -523,6 +571,11 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) if ((!vcpu->time_page)) return; + if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) { + kvm_set_time_scale(tsc_khz, &vcpu->hv_clock); + vcpu->hv_clock_tsc_khz = tsc_khz; + } + /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, @@ -537,14 +590,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) /* * The interface expects us to write an even number signaling that the * update is finished. Since the guest won't see the intermediate - * state, we just write "2" at the end + * state, we just increase by 2 at the end. 
*/ - vcpu->hv_clock.version = 2; + vcpu->hv_clock.version += 2; shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + sizeof(vcpu->hv_clock)); kunmap_atomic(shared_kaddr, KM_USER0); @@ -599,10 +652,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* ...but clean it before doing the actual write */ vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); - vcpu->arch.hv_clock.tsc_to_system_mul = - clocksource_khz2mult(tsc_khz, 22); - vcpu->arch.hv_clock.tsc_shift = 22; down_read(&current->mm->mmap_sem); vcpu->arch.time_page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); -- cgit v1.2.3 From f6e16d5ad463d15f285666f588cfe49495c692d9 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 3 Jun 2008 16:17:32 +0200 Subject: x86: KVM guest: Use the paravirt clocksource structs and functions This patch updates the kvm guest code to use the pvclock structs and functions, thereby making it compatible with Xen. The patch also fixes an initialization bug: on SMP systems the per-cpu area has two different locations early at boot and after CPU bringup. kvmclock must take that into account when registering the physical address within the host. Signed-off-by: Gerd Hoffmann Signed-off-by: Avi Kivity --- arch/x86/Kconfig | 1 + arch/x86/kernel/kvmclock.c | 89 +++++++++++++++++----------------------------- 2 files changed, 34 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f94bca6ff47..e0edaaa6920 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -383,6 +383,7 @@ config VMI config KVM_CLOCK bool "KVM paravirtualized clock" select PARAVIRT + select PARAVIRT_CLOCK depends on !(X86_VISWS || X86_VOYAGER) help Turning on this option will allow you to run a paravirtualized clock diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 08a30986d47..87edf1ceb1d 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -36,18 +37,9 @@ static int parse_no_kvmclock(char *arg) early_param("no-kvmclock", parse_no_kvmclock); /* The hypervisor will put information about time periodically here */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); -#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field +static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock); +static struct pvclock_wall_clock wall_clock; -static inline u64 kvm_get_delta(u64 last_tsc) -{ - int cpu = smp_processor_id(); - u64 delta = native_read_tsc() - last_tsc; - return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE; -} - -static struct kvm_wall_clock wall_clock; -static cycle_t kvm_clock_read(void); /* * The wallclock is the time of day when we booted. Since then, some time may * have elapsed since the hypervisor wrote the data.
So we try to account for @@ -55,64 +47,37 @@ static cycle_t kvm_clock_read(void); */ static unsigned long kvm_get_wallclock(void) { - u32 wc_sec, wc_nsec; - u64 delta; + struct pvclock_vcpu_time_info *vcpu_time; struct timespec ts; - int version, nsec; int low, high; low = (int)__pa(&wall_clock); high = ((u64)__pa(&wall_clock) >> 32); + native_write_msr(MSR_KVM_WALL_CLOCK, low, high); - delta = kvm_clock_read(); + vcpu_time = &get_cpu_var(hv_clock); + pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); + put_cpu_var(hv_clock); - native_write_msr(MSR_KVM_WALL_CLOCK, low, high); - do { - version = wall_clock.wc_version; - rmb(); - wc_sec = wall_clock.wc_sec; - wc_nsec = wall_clock.wc_nsec; - rmb(); - } while ((wall_clock.wc_version != version) || (version & 1)); - - delta = kvm_clock_read() - delta; - delta += wc_nsec; - nsec = do_div(delta, NSEC_PER_SEC); - set_normalized_timespec(&ts, wc_sec + delta, nsec); - /* - * Of all mechanisms of time adjustment I've tested, this one - * was the champion! - */ - return ts.tv_sec + 1; + return ts.tv_sec; } static int kvm_set_wallclock(unsigned long now) { - return 0; + return -1; } -/* - * This is our read_clock function. The host puts an tsc timestamp each time - * it updates a new time. Without the tsc adjustment, we can have a situation - * in which a vcpu starts to run earlier (smaller system_time), but probes - * time later (compared to another vcpu), leading to backwards time - */ static cycle_t kvm_clock_read(void) { - u64 last_tsc, now; - int cpu; + struct pvclock_vcpu_time_info *src; + cycle_t ret; - preempt_disable(); - cpu = smp_processor_id(); - - last_tsc = get_clock(cpu, tsc_timestamp); - now = get_clock(cpu, system_time); - - now += kvm_get_delta(last_tsc); - preempt_enable(); - - return now; + src = &get_cpu_var(hv_clock); + ret = pvclock_clocksource_read(src); + put_cpu_var(hv_clock); + return ret; } + static struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_read, @@ -123,13 +88,14 @@ static struct clocksource kvm_clock = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static int kvm_register_clock(void) +static int kvm_register_clock(char *txt) { int cpu = smp_processor_id(); int low, high; low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); - + printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", + cpu, high, low, txt); return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); } @@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void) * Now that the first cpu already had this clocksource initialized, * we shouldn't fail. */ - WARN_ON(kvm_register_clock()); + WARN_ON(kvm_register_clock("secondary cpu clock")); /* ok, done with our trickery, call native */ setup_secondary_APIC_clock(); } #endif +#ifdef CONFIG_SMP +void __init kvm_smp_prepare_boot_cpu(void) +{ + WARN_ON(kvm_register_clock("primary cpu clock")); + native_smp_prepare_boot_cpu(); +} +#endif + /* * After the clock is registered, the host will keep writing to the * registered memory location. 
If the guest happens to shutdown, this memory @@ -174,13 +148,16 @@ void __init kvmclock_init(void) return; if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { - if (kvm_register_clock()) + if (kvm_register_clock("boot clock")) return; pv_time_ops.get_wallclock = kvm_get_wallclock; pv_time_ops.set_wallclock = kvm_set_wallclock; pv_time_ops.sched_clock = kvm_clock_read; #ifdef CONFIG_X86_LOCAL_APIC pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; #endif +#ifdef CONFIG_SMP + smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; #endif machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC -- cgit v1.2.3 From 0b1faeef5f9243bb5fc5713a34bbf1ceab0de562 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Sun, 15 Jun 2008 12:32:15 +0100 Subject: x86: section/warning fixes WARNING: arch/x86/mm/built-in.o(.text+0x3a1): Section mismatch in reference from the function set_pte_phys() to the function .init.text:spp_getpage() The function set_pte_phys() references the function __init spp_getpage(). This is often because set_pte_phys lacks a __init annotation or the annotation of spp_getpage is wrong. arch/x86/mm/init_64.c: In function 'early_memtest': arch/x86/mm/init_64.c:520: warning: passing argument 2 of 'find_e820_area_size' from incompatible pointer type Signed-off-by: Daniel J Blueman Cc: "Linus Torvalds" Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 156e6d7b0e3..f6d20be7a8f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -135,7 +135,7 @@ static __init void *spp_getpage(void) return ptr; } -static void +static __init void set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) { pgd_t *pgd; @@ -214,7 +214,7 @@ void __init cleanup_highmap(void) } /* NOTE: this is meant to be run only at boot */ -void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) +void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) { unsigned long address = __fix_to_virt(idx); @@ -506,7 +506,7 @@ early_param("memtest", parse_memtest); static void __init early_memtest(unsigned long start, unsigned long end) { - u64 t_start, t_size; + unsigned long t_start, t_size; unsigned pattern; if (!memtest_pattern) @@ -525,7 +525,7 @@ static void __init early_memtest(unsigned long start, unsigned long end) if (t_start + t_size > end) t_size = end - t_start; - printk(KERN_CONT "\n %016llx - %016llx pattern %d", + printk(KERN_CONT "\n %016lx - %016lx pattern %d", t_start, t_start + t_size, pattern); memtest(t_start, t_size, pattern); -- cgit v1.2.3 From fcb43042ef55d2f46b0efa5d7746967cef38f056 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Tue, 24 Jun 2008 16:06:23 +0800 Subject: x86: fix cpu hotplug crash Vegard Nossum reported crashes during cpu hotplug tests: http://marc.info/?l=linux-kernel&m=121413950227884&w=4 In function _cpu_up, the panic happens on the second call to __raw_notifier_call_chain; the kernel doesn't panic on the first call, so simply blaming nr_cpu_ids is not correct. By checking the source code, I found that do_boot_cpu is the culprit. Consider the call chain below: _cpu_up=>__cpu_up=>smp_ops.cpu_up=>native_cpu_up=>do_boot_cpu. So do_boot_cpu is called in the end. In do_boot_cpu, if boot_error==true, cpu_clear(cpu, cpu_possible_map) is executed.
So later on, when _cpu_up calls __raw_notifier_call_chain a second time to report CPU_UP_CANCELED, because this cpu is already cleared from cpu_possible_map, get_cpu_sysdev returns NULL. Many resources are related to cpu_possible_map, so it's better not to change it. The patch below, against 2.6.26-rc7, fixes it by removing the bit clearing in cpu_possible_map. Signed-off-by: Zhang Yanmin Tested-by: Vegard Nossum Acked-by: Rusty Russell Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 56078d61c79..3e1cecedde4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -996,7 +996,6 @@ do_rest: #endif cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */ cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ - cpu_clear(cpu, cpu_possible_map); cpu_clear(cpu, cpu_present_map); per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; } -- cgit v1.2.3 From 11dbc963a8f6128595d0f6ecf138dc369e144997 Mon Sep 17 00:00:00 2001 From: TAKADA Yoshihito Date: Mon, 30 Jun 2008 13:44:45 +0900 Subject: ptrace GET/SET FPXREGS broken When I updated the kernel from 2.6.24 to 2.6.25, gdb stopped working. On 2.6.25, ptrace(PTRACE_GETFPXREGS, ...) returns ENODEV. But the 2.6.24 kernel's ptrace() returns EIO. It is a compatibility issue. I attached a test program (pt.c) and a patch to fix it. #include #include #include #include #include #include #include struct user_fxsr_struct { unsigned short cwd; unsigned short swd; unsigned short twd; unsigned short fop; long fip; long fcs; long foo; long fos; long mxcsr; long reserved; long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ long padding[56]; }; int main(void) { pid_t pid; pid = fork(); switch(pid){ case -1:/* error */ break; case 0:/* child */ child(); break; default: parent(pid); break; } return 0; } int child(void) { ptrace(PTRACE_TRACEME); kill(getpid(), SIGSTOP); sleep(10); return 0; } int parent(pid_t pid) { int ret; struct user_fxsr_struct fpxregs; ret = ptrace(PTRACE_GETFPXREGS, pid, 0, &fpxregs); if(ret < 0){ printf("%d: %s.\n", errno, strerror(errno)); } kill(pid, SIGCONT); wait(pid); return 0; } /* in the kernel, at kernel/i387.c get_fpxregs() */ Signed-off-by: Ingo Molnar --- arch/x86/kernel/i387.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index eb9ddd8efb8..95e80e5033c 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -162,7 +162,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, int ret; if (!cpu_has_fxsr) - return -ENODEV; + return -EIO; ret = init_fpu(target); if (ret) @@ -179,7 +179,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, int ret; if (!cpu_has_fxsr) - return -ENODEV; + return -EIO; ret = init_fpu(target); if (ret) -- cgit v1.2.3 From efac41894df57d32b483ac622d03541b5b2692c0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Jul 2008 08:56:32 +0200 Subject: x86: fix NODES_SHIFT Kconfig range commit 4323838215184f5a2f081e0d17b8d60731b03164 x86: change size of node ids from u8 to s16 set the range for NODES_SHIFT to 1..15.
The possible range is 1..9 Fixes Bugzilla #10726 Reported-by: Dave Jones Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 52e18e6d2ba..8a07f417f5f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -961,8 +961,8 @@ config NUMA_EMU number of nodes. This is only useful for debugging. config NODES_SHIFT - int "Max num nodes shift(1-15)" - range 1 15 if X86_64 + int "Max num nodes shift(1-9)" + range 1 9 if X86_64 default "6" if X86_64 default "4" if X86_NUMAQ default "3" -- cgit v1.2.3 From 216705d2720dedf630b55d641737f430ead0c228 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Jul 2008 22:48:03 +0100 Subject: x86: fix Intel Mac booting with EFI Fedora reports that mem_init()'s zap_low_mappings(), extended to SMP in 61165d7a035f6571c7576e7f51e7230157724c8d x86: fix app crashes after SMP resume causes 32-bit Intel Mac machines to reboot very early when booting with EFI. The EFI code appears to manage low mappings for itself when needed; but like many before it, confuses PSE with PAE. So it has only been mapping half the space it needed when PSE but not PAE. This remained unnoticed until we moved the SMP zap_low_mappings() before efi_enter_virtual_mode(). Presumably could have been noticed years ago if anyone ran a UP kernel on such machines? Reported-by: Peter Jones Signed-off-by: Hugh Dickins Cc: Peter Jones Cc: Glauber Costa Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Ingo Molnar Tested-by: Peter Jones --- arch/x86/kernel/efi_32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c index 5d23d85624d..4b63c8e1f13 100644 --- a/arch/x86/kernel/efi_32.c +++ b/arch/x86/kernel/efi_32.c @@ -49,13 +49,13 @@ void efi_call_phys_prelog(void) local_irq_save(efi_rt_eflags); /* - * If I don't have PSE, I should just duplicate two entries in page - * directory. If I have PSE, I just need to duplicate one entry in + * If I don't have PAE, I should just duplicate two entries in page + * directory. If I have PAE, I just need to duplicate one entry in * page directory. 
*/ cr4 = read_cr4(); - if (cr4 & X86_CR4_PSE) { + if (cr4 & X86_CR4_PAE) { efi_bak_pg_dir_pointer[0].pgd = swapper_pg_dir[pgd_index(0)].pgd; swapper_pg_dir[0].pgd = @@ -93,7 +93,7 @@ void efi_call_phys_epilog(void) cr4 = read_cr4(); - if (cr4 & X86_CR4_PSE) { + if (cr4 & X86_CR4_PAE) { swapper_pg_dir[pgd_index(0)].pgd = efi_bak_pg_dir_pointer[0].pgd; } else { -- cgit v1.2.3 From 27df66a406a171308b138bd84938cb735392e15c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 3 Jul 2008 10:14:10 +0200 Subject: arch/x86/mm/init_64.c: early_memtest(): fix types fix this warning: arch/x86/mm/init_64.c: In function 'early_memtest': arch/x86/mm/init_64.c:524: warning: passing argument 2 of 'find_e820_area_size' from incompatible pointer type Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index f6d20be7a8f..819dad973b1 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -506,7 +506,7 @@ early_param("memtest", parse_memtest); static void __init early_memtest(unsigned long start, unsigned long end) { - unsigned long t_start, t_size; + u64 t_start, t_size; unsigned pattern; if (!memtest_pattern) @@ -525,8 +525,9 @@ static void __init early_memtest(unsigned long start, unsigned long end) if (t_start + t_size > end) t_size = end - t_start; - printk(KERN_CONT "\n %016lx - %016lx pattern %d", - t_start, t_start + t_size, pattern); + printk(KERN_CONT "\n %016llx - %016llx pattern %d", + (unsigned long long)t_start, + (unsigned long long)t_start + t_size, pattern); memtest(t_start, t_size, pattern); -- cgit v1.2.3 From d8355aca23863be659ec5b7e0393cfbfa91ec221 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 3 Jul 2008 22:10:18 -0700 Subject: xen: fix address truncation in pte mfn<->pfn conversion When converting the page number in a pte/pmd/pud/pgd between machine and pseudo-physical addresses, the converted result was being truncated at 32-bits. This caused failures on machines with more than 4G of physical memory. Signed-off-by: Jeremy Fitzhardinge Cc: "Christopher S. Aker" Cc: Ian Campbell Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index df40bf74ea7..4e527e7893a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -185,7 +185,7 @@ static pteval_t pte_mfn_to_pfn(pteval_t val) if (val & _PAGE_PRESENT) { unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; pteval_t flags = val & ~PTE_MASK; - val = (mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; + val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; } return val; @@ -196,7 +196,7 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) if (val & _PAGE_PRESENT) { unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; pteval_t flags = val & ~PTE_MASK; - val = (pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; + val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; } return val; -- cgit v1.2.3 From 4b4f7280d7fd1feeff134c2cf2db32fd583b6c29 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 24 Jun 2008 23:03:48 +0200 Subject: x86 ACPI: normalize segment descriptor register on resume Some Dell laptops enter resume with apparent garbage in the segment descriptor registers (almost certainly the result of a botched transition from protected to real mode.) The only way to clean that up is to enter protected mode ourselves and clean out the descriptor registers. 
This fixes resume on Dell XPS M1210 and Dell D620. Reference: http://bugzilla.kernel.org/show_bug.cgi?id=10927 Signed-off-by: H. Peter Anvin Cc: Andrew Morton Cc: Pavel Machek Cc: pm list Cc: Len Brown Signed-off-by: Ingo Molnar Tested-by: Kirill A. Shutemov Signed-off-by: Rafael J. Wysocki Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/realmode/wakeup.S | 38 +++++++++++++++++++++++++++++++++- arch/x86/kernel/acpi/realmode/wakeup.h | 5 +++++ arch/x86/kernel/acpi/sleep.c | 16 +++++++++++++- 3 files changed, 57 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S index f9b77fb37e5..3355973b12a 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.S @@ -5,6 +5,7 @@ #include #include #include +#include .code16 .section ".header", "a" @@ -24,6 +25,11 @@ pmode_gdt: .quad 0 realmode_flags: .long 0 real_magic: .long 0 trampoline_segment: .word 0 +_pad1: .byte 0 +wakeup_jmp: .byte 0xea /* ljmpw */ +wakeup_jmp_off: .word 3f +wakeup_jmp_seg: .word 0 +wakeup_gdt: .quad 0, 0, 0 signature: .long 0x51ee1111 .text @@ -34,11 +40,34 @@ _start: cli cld + /* Apparently some dimwit BIOS programmers don't know how to + program a PM to RM transition, and we might end up here with + junk in the data segment descriptor registers. The only way + to repair that is to go into PM and fix it ourselves... */ + movw $16, %cx + lgdtl %cs:wakeup_gdt + movl %cr0, %eax + orb $X86_CR0_PE, %al + movl %eax, %cr0 + jmp 1f +1: ljmpw $8, $2f +2: + movw %cx, %ds + movw %cx, %es + movw %cx, %ss + movw %cx, %fs + movw %cx, %gs + + andb $~X86_CR0_PE, %al + movl %eax, %cr0 + jmp wakeup_jmp +3: /* Set up segments */ movw %cs, %ax movw %ax, %ds movw %ax, %es movw %ax, %ss + lidtl wakeup_idt movl $wakeup_stack_end, %esp @@ -98,7 +127,14 @@ bogus_real_magic: jmp 1b .data - .balign 4 + .balign 8 + + /* This is the standard real-mode IDT */ +wakeup_idt: + .word 0xffff /* limit */ + .long 0 /* address */ + .word 0 + .globl HEAP, heap_end HEAP: .long wakeup_heap diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h index ef8166fe802..69d38d0b2b6 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.h +++ b/arch/x86/kernel/acpi/realmode/wakeup.h @@ -24,6 +24,11 @@ struct wakeup_header { u32 realmode_flags; u32 real_magic; u16 trampoline_segment; /* segment with trampoline code, 64-bit only */ + u8 _pad1; + u8 wakeup_jmp; + u16 wakeup_jmp_off; + u16 wakeup_jmp_seg; + u64 wakeup_gdt[3]; u32 signature; /* To check we have correct structure */ } __attribute__((__packed__)); diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index afc25ee9964..36af01f029e 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -50,6 +50,20 @@ int acpi_save_state_mem(void) header->video_mode = saved_video_mode; + header->wakeup_jmp_seg = acpi_wakeup_address >> 4; + /* GDT[0]: GDT self-pointer */ + header->wakeup_gdt[0] = + (u64)(sizeof(header->wakeup_gdt) - 1) + + ((u64)(acpi_wakeup_address + + ((char *)&header->wakeup_gdt - (char *)acpi_realmode)) + << 16); + /* GDT[1]: real-mode-like code segment */ + header->wakeup_gdt[1] = (0x009bULL << 40) + + ((u64)acpi_wakeup_address << 16) + 0xffff; + /* GDT[2]: real-mode-like data segment */ + header->wakeup_gdt[2] = (0x0093ULL << 40) + + ((u64)acpi_wakeup_address << 16) + 0xffff; + #ifndef CONFIG_64BIT store_gdt((struct desc_ptr *)&header->pmode_gdt); @@ -111,7 +125,7 @@ void __init 
 		return;
 	}
 
-	acpi_wakeup_address = acpi_realmode;
+	acpi_wakeup_address = virt_to_phys((void *)acpi_realmode);
 }
-- cgit v1.2.3

From 64e83b5a919a65eb35b63dd7e07c188379ff8ce6 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Sat, 5 Jul 2008 00:05:30 +0200
Subject: x86 ACPI: fix resume from suspend to RAM on uniprocessor x86-64

Since the trampoline code is now used for ACPI resume from suspend to
RAM, the trampoline page tables have to be fixed up during boot not
only on SMP systems, but also on UP systems that use the trampoline.

Reference: http://bugzilla.kernel.org/show_bug.cgi?id=10923

Reported-by: Dionisus Torimens
Signed-off-by: Rafael J. Wysocki
Cc: Andi Kleen
Cc: Andrew Morton
Cc: pm list
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/head_64.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 10a1955bb1d..b817974ef94 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -128,7 +128,7 @@ ident_complete:
 	/* Fixup phys_base */
 	addq	%rbp, phys_base(%rip)
-#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_TRAMPOLINE
 	addq	%rbp, trampoline_level4_pgt + 0(%rip)
 	addq	%rbp, trampoline_level4_pgt + (511*8)(%rip)
 #endif
-- cgit v1.2.3

From 739db07f82767e7634176d18af2acbe77b11fd42 Mon Sep 17 00:00:00 2001
From: Jesse Barnes
Date: Mon, 7 Jul 2008 09:55:26 -0700
Subject: Revert "PCI: Correct last two HP entries in the bfsort whitelist"

This reverts commit a1676072558854b95336c8f7db76b0504e909a0a.  It
duplicates the change from 8d64c781f0c5fbfdf8016bd1634506ff2ad1376a,
and only one should be applied; otherwise some of the Dell quirks are
lost.

Thanks to Tony Camuso for catching this.

Acked-by: Tony Camuso
Signed-off-by: Jesse Barnes
---
 arch/x86/pci/common.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 940185ecaed..6e64aaf00d1 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -328,18 +328,18 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
 #endif
 	{
 		.callback = set_bf_sort,
-		.ident = "HP ProLiant DL360",
+		.ident = "HP ProLiant DL385 G2",
 		.matches = {
 			DMI_MATCH(DMI_SYS_VENDOR, "HP"),
-			DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL360"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL385 G2"),
 		},
 	},
 	{
 		.callback = set_bf_sort,
-		.ident = "HP ProLiant DL380",
+		.ident = "HP ProLiant DL585 G2",
 		.matches = {
 			DMI_MATCH(DMI_SYS_VENDOR, "HP"),
-			DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL380"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL585 G2"),
 		},
 	},
 	{}
-- cgit v1.2.3

From a361ee5cb8011763ece7b4add393e206439db8b3 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi
Date: Thu, 10 Jul 2008 10:09:59 +0200
Subject: x86: fix /dev/mem compatibility under PAT

Add ioremap_default(), which gives a sane mapping without worrying
about type conflicts.  Use it in the /dev/mem read path in place of
ioremap(): with ioremap(), any mapping of the region other than
UC_MINUS causes a conflict, and the /dev/mem read fails.
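The symptom is straightforward to observe from userspace.  Below is a
hypothetical reproducer, not part of the patch; it assumes root and a
kernel without strict /dev/mem restrictions, and it touches the legacy
video BIOS shadow that vbetest maps:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Hypothetical reproducer: read one page of the legacy video BIOS
 * shadow at 0xC0000 via /dev/mem.  On an affected PAT kernel, a
 * conflicting mapping of this range makes the read fail; with
 * ioremap_default() the read inherits the existing memtype instead.
 */
int main(void)
{
	unsigned char buf[4096];
	int fd = open("/dev/mem", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/mem");
		return 1;
	}
	if (pread(fd, buf, sizeof(buf), 0xC0000) < 0)
		perror("pread 0xC0000");
	else
		printf("read ok: first bytes %02x %02x\n", buf[0], buf[1]);
	close(fd);
	return 0;
}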
Should address the vbetest failure reported at:

  http://bugzilla.kernel.org/show_bug.cgi?id=11057

Signed-off-by: Venkatesh Pallipadi
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/ioremap.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 2b2bb3f9b68..d1b867101e5 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -300,6 +300,29 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
 }
 EXPORT_SYMBOL(ioremap_cache);
 
+static void __iomem *ioremap_default(resource_size_t phys_addr,
+				     unsigned long size)
+{
+	unsigned long flags;
+	void *ret;
+	int err;
+
+	/*
+	 * - WB for WB-able memory and no other conflicting mappings
+	 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
+	 * - Inherit from conflicting mappings otherwise
+	 */
+	err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
+	if (err < 0)
+		return NULL;
+
+	ret = (void *) __ioremap_caller(phys_addr, size, flags,
+					__builtin_return_address(0));
+
+	free_memtype(phys_addr, phys_addr + size);
+	return (void __iomem *)ret;
+}
+
 /**
  * iounmap - Free a IO remapping
  * @addr: virtual address from ioremap_*
@@ -365,7 +388,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
 	if (page_is_ram(start >> PAGE_SHIFT))
 		return __va(phys);
 
-	addr = (void *)ioremap(start, PAGE_SIZE);
+	addr = (void *)ioremap_default(start, PAGE_SIZE);
 	if (addr)
 		addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
-- cgit v1.2.3

From b10e9ad0f1d0dc62bd444dd6761a6527bfe98959 Mon Sep 17 00:00:00 2001
From: Daniel Guilak
Date: Thu, 10 Jul 2008 09:39:32 -0700
Subject: arch/x86/kernel/.gitignore: Added vmlinux.lds to .gitignore file because it shouldn't be tracked.

Signed-off-by: Daniel Guilak
Signed-off-by: Linus Torvalds
---
 arch/x86/kernel/.gitignore | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore
index 4ea38a39aed..08f4fd73146 100644
--- a/arch/x86/kernel/.gitignore
+++ b/arch/x86/kernel/.gitignore
@@ -1,2 +1,3 @@
 vsyscall.lds
 vsyscall_32.lds
+vmlinux.lds
-- cgit v1.2.3