From 54c28d294c658abb6d6430a49fda230fdfd601c8 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Fri, 3 Apr 2009 15:39:42 -0500
Subject: x86, uv: add Kconfig dependency on NUMA for UV systems

Impact: build fix

Add Kconfig dependency on NUMA for enabling UV. Although it might
be possible to configure non-NUMA UV systems, they are unsupported
and not interesting. Much of the infrastructure for UV requires
NUMA support.

Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20090403203942.GA20137@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/Kconfig')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 748e50a1a15..2817ab5a120 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -354,6 +354,7 @@ config X86_UV
 	bool "SGI Ultraviolet"
 	depends on X86_64
 	depends on X86_EXTENDED_PLATFORM
+	depends on NUMA
 	select X86_X2APIC
 	---help---
 	  This option is needed in order to support SGI Ultraviolet systems.
-- 
cgit v1.2.3


From ca713c2ab0eea3458962983e4a7e13430ea479b8 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 15 Apr 2009 18:39:13 -0700
Subject: x86/irq: mark NUMA_MIGRATE_IRQ_DESC broken

It causes crash on system with lots of cards with MSI-X
when irq_balancer enabled...

The patches fixing it were both complex and fragile, according
to Eric they were also doing quite dangerous things to the
hardware.

Instead we now have patches that solve this problem via static
NUMA node mappings - not dynamic allocation and balancing.

The patches are much simpler than this method but are still too
large outside of the merge window, so we mark the dynamic balancer
as broken for now, and queue up the new approach for v2.6.31.

[ Impact: deactivate broken kernel feature ]

Reported-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <49E68C41.4020801@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/Kconfig')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bc25b9f5e4c..b1d2be7d91a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -277,6 +277,7 @@ config SPARSE_IRQ
 config NUMA_MIGRATE_IRQ_DESC
 	bool "Move irq desc when changing irq smp_affinity"
 	depends on SPARSE_IRQ && NUMA
+	depends on BROKEN
 	default n
 	---help---
 	  This enables moving irq_desc to cpu/node that irq will use handled.
-- 
cgit v1.2.3


From 2a3313f494c2f3f74a27d66f0f14b38558b7dba2 Mon Sep 17 00:00:00 2001
From: "Michael K. Johnson" <johnsonm@rpath.com>
Date: Tue, 21 Apr 2009 21:44:48 -0400
Subject: x86: more than 8 32-bit CPUs requires X86_BIGSMP

$ cat x86-more-than-8-cpus-requires-bigsmp.patch

Enforce NR_CPUS <= 8 limitation if X86_BIGSMP not set

Configuring more than 8 logical CPUs on 32-bit x86 requires
X86_BIGSMP to be set in order to boot successfully, if more than 8
logical CPUs are actually found at boot time.  The X86_BIGSMP help
text describes that it is required to be set if more than 8 CPUs
are configured, but this was previously not enforced.

This configuration error has affected multiple distributions:
    https://bugzilla.redhat.com/show_bug.cgi?id=480844
    https://issues.rpath.com/browse/RPL-3022

Signed-off-by: Michael K Johnson <johnsonm@rpath.com>
LKML-Reference: <20090422014448.GB32541@logo.rdu.rpath.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/Kconfig')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c9086e6307a..b5cda6c03d1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -664,6 +664,7 @@ config MAXSMP
 
 config NR_CPUS
 	int "Maximum number of CPUs" if SMP && !MAXSMP
+	range 2 8 if SMP && X86_32 && !X86_BIGSMP
 	range 2 512 if SMP && !MAXSMP
 	default "1" if !SMP
 	default "4096" if MAXSMP
-- 
cgit v1.2.3


From b4ecc126991b30fe5f9a59dfacda046aeac124b2 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 13 May 2009 17:16:55 -0700
Subject: x86: Fix performance regression caused by paravirt_ops on native
 kernels

Xiaohui Xin and some other folks at Intel have been looking into what's
behind the performance hit of paravirt_ops when running native.

It appears that the hit is entirely due to the paravirtualized
spinlocks introduced by:

 | commit 8efcbab674de2bee45a2e4cdf97de16b8e609ac8
 | Date:   Mon Jul 7 12:07:51 2008 -0700
 |
 |     paravirt: introduce a "lock-byte" spinlock implementation

The extra call/return in the spinlock path is somehow
causing an increase in the cycles/instruction of somewhere around 2-7%
(seems to vary quite a lot from test to test).  The working theory is
that the CPU's pipeline is getting upset about the
call->call->locked-op->return->return, and seems to be failing to
speculate (though I haven't seen anything definitive about the precise
reasons).  This doesn't entirely make sense, because the performance
hit is also visible on unlock and other operations which don't involve
locked instructions.  But spinlock operations clearly swamp all the
other pvops operations, even though I can't imagine that they're
nearly as common (there's only a .05% increase in instructions
executed).

If I disable just the pv-spinlock calls, my tests show that pvops is
identical to non-pvops performance on native (my measurements show that
it is actually about .1% faster, but Xiaohui shows a .05% slowdown).

Summary of results, averaging 10 runs of the "mmperf" test, using a
no-pvops build as baseline:

		nopv		Pv-nospin	Pv-spin
CPU cycles	100.00%		99.89%		102.18%
instructions	100.00%		100.10%		100.15%
CPI		100.00%		99.79%		102.03%
cache ref	100.00%		100.84%		100.28%
cache miss	100.00%		90.47%		88.56%
cache miss rate	100.00%		89.72%		88.31%
branches	100.00%		99.93%		100.04%
branch miss	100.00%		103.66%		107.72%
branch miss rt	100.00%		103.73%		107.67%
wallclock	100.00%		99.90%		102.20%

The clear effect here is that the 2% increase in CPI is
directly reflected in the final wallclock time.

(The other interesting effect is that the more ops are
out of line calls via pvops, the lower the cache access
and miss rates.  Not too surprising, but it suggests that
the non-pvops kernel is over-inlined.  On the flipside,
the branch misses go up correspondingly...)

So, what's the fix?

Paravirt patching turns all the pvops calls into direct calls, so
_spin_lock etc do end up having direct calls.  For example, the compiler
generated code for paravirtualized _spin_lock is:

<_spin_lock+0>:		mov    %gs:0xb4c8,%rax
<_spin_lock+9>:		incl   0xffffffffffffe044(%rax)
<_spin_lock+15>:	callq  *0xffffffff805a5b30
<_spin_lock+22>:	retq

The indirect call will get patched to:
<_spin_lock+0>:		mov    %gs:0xb4c8,%rax
<_spin_lock+9>:		incl   0xffffffffffffe044(%rax)
<_spin_lock+15>:	callq <__ticket_spin_lock>
<_spin_lock+20>:	nop; nop		/* or whatever 2-byte nop */
<_spin_lock+22>:	retq

One possibility is to inline _spin_lock, etc, when building an
optimised kernel (ie, when there's no spinlock/preempt
instrumentation/debugging enabled).  That will remove the outer
call/return pair, returning the instruction stream to a single
call/return, which will presumably execute the same as the non-pvops
case.  The downsides arel 1) it will replicate the
preempt_disable/enable code at eack lock/unlock callsite; this code is
fairly small, but not nothing; and 2) the spinlock definitions are
already a very heavily tangled mass of #ifdefs and other preprocessor
magic, and making any changes will be non-trivial.

The other obvious answer is to disable pv-spinlocks.  Making them a
separate config option is fairly easy, and it would be trivial to
enable them only when Xen is enabled (as the only non-default user).
But it doesn't really address the common case of a distro build which
is going to have Xen support enabled, and leaves the open question of
whether the native performance cost of pv-spinlocks is worth the
performance improvement on a loaded Xen system (10% saving of overall
system CPU when guests block rather than spin).  Still it is a
reasonable short-term workaround.

[ Impact: fix pvops performance regression when running native ]

Analysed-by: "Xin Xiaohui" <xiaohui.xin@intel.com>
Analysed-by: "Li Xin" <xin.li@intel.com>
Analysed-by: "Nakajima Jun" <jun.nakajima@intel.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Xen-devel <xen-devel@lists.xensource.com>
LKML-Reference: <4A0B62F7.5030802@goop.org>
[ fixed the help text ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'arch/x86/Kconfig')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index df9e885eee1..a6efe0a2e9a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -498,6 +498,19 @@ config PARAVIRT
 	  over full virtualization.  However, when run without a hypervisor
 	  the kernel is theoretically slower and slightly larger.
 
+config PARAVIRT_SPINLOCKS
+	bool "Paravirtualization layer for spinlocks"
+	depends on PARAVIRT && SMP && EXPERIMENTAL
+	---help---
+	  Paravirtualized spinlocks allow a pvops backend to replace the
+	  spinlock implementation with something virtualization-friendly
+	  (for example, block the virtual CPU rather than spinning).
+
+	  Unfortunately the downside is an up to 5% performance hit on
+	  native kernels, with various workloads.
+
+	  If you are unsure how to answer this question, answer N.
+
 config PARAVIRT_CLOCK
 	bool
 	default n
-- 
cgit v1.2.3