From 74d4affde8feb8d5bdebf7fba8e90e4eae3b7b1d Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Mon, 7 Jul 2008 12:07:50 -0700
Subject: x86/paravirt: add hooks for spinlock operations

Ticket spinlocks have absolutely ghastly worst-case performance
characteristics in a virtual environment.  If there is any contention
for physical CPUs (ie, there are more runnable vcpus than cpus), then
ticket locks can cause the system to end up spending 90+% of its time
spinning.

The problem is that (v)cpus waiting on a ticket spinlock will be
granted access to the lock in the strict order in which they got their
tickets.  If the hypervisor scheduler doesn't give the vcpus time in
that order, they will burn timeslices waiting for the scheduler to give
the right vcpu some time.  In the worst case it could take O(n^2) vcpu
scheduler timeslices for everyone waiting on the lock to get it, not
counting new cpus trying to take the lock while the log-jam is sorted
out.

These hooks allow a paravirt backend to replace the spinlock
implementation.  At the very least, this could revert the
implementation to the old lock algorithm, which allows the next
scheduled vcpu to take the lock, and has reasonably good performance.
It also allows the spinlocks to take advantage of hypervisor features
to make locks more efficient (spin and block, for example).

The cost to native execution is an extra direct call when using a
spinlock function.  There's no overhead if CONFIG_PARAVIRT is turned
off.

The lock structure is fixed at a single "unsigned int", initialized to
zero, but the spinlock implementation can use it as it wishes.

Thanks to Thomas Friebel's Xen Summit talk "Preventing Guests from
Spinning Around" for pointing out this problem.

Signed-off-by: Jeremy Fitzhardinge
Cc: Jens Axboe
Cc: Peter Zijlstra
Cc: Christoph Lameter
Cc: Petr Tesarik
Cc: Virtualization
Cc: Xen devel
Cc: Thomas Friebel
Cc: Nick Piggin
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/paravirt.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 2963ab5d91e..f3381686870 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -124,6 +124,7 @@ static void *get_call_destination(u8 type)
         .pv_irq_ops = pv_irq_ops,
         .pv_apic_ops = pv_apic_ops,
         .pv_mmu_ops = pv_mmu_ops,
+        .pv_lock_ops = pv_lock_ops,
     };
     return *((void **)&tmpl + type);
 }
@@ -450,6 +451,15 @@ struct pv_mmu_ops pv_mmu_ops = {
     .set_fixmap = native_set_fixmap,
 };

+struct pv_lock_ops pv_lock_ops = {
+    .spin_is_locked = __ticket_spin_is_locked,
+    .spin_is_contended = __ticket_spin_is_contended,
+
+    .spin_lock = __ticket_spin_lock,
+    .spin_trylock = __ticket_spin_trylock,
+    .spin_unlock = __ticket_spin_unlock,
+};
+
 EXPORT_SYMBOL_GPL(pv_time_ops);
 EXPORT_SYMBOL    (pv_cpu_ops);
 EXPORT_SYMBOL    (pv_mmu_ops);
--
cgit v1.2.3
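The mechanism is easiest to see outside the kernel.  Below is a minimal,
self-contained userspace C sketch of the indirection this patch introduces: a
table of function pointers filled in with a native default, which a backend
may overwrite at boot, with every generic lock operation going through the
table.  The type and helper names here (raw_spinlock_t, native_lock,
use_backend_locks, noisy_lock) are illustrative stand-ins rather than the
kernel's code, and the sketch simply calls through the pointer, whereas the
kernel's paravirt machinery patches such sites into direct calls on native
hardware.

/*
 * Userspace sketch of the pv_lock_ops hook mechanism (illustration only).
 * A backend swaps the ops before any lock is taken; callers always go
 * through the table.
 */
#include <stdatomic.h>
#include <stdio.h>

typedef struct {
        atomic_uint slock;      /* fixed at one word, initialized to zero */
} raw_spinlock_t;

struct pv_lock_ops {
        int  (*spin_is_locked)(raw_spinlock_t *lock);
        void (*spin_lock)(raw_spinlock_t *lock);
        void (*spin_unlock)(raw_spinlock_t *lock);
};

/* "Native" default: a trivial test-and-set lock standing in for ticket locks */
static int native_is_locked(raw_spinlock_t *l) { return atomic_load(&l->slock) != 0; }
static void native_lock(raw_spinlock_t *l)
{
        while (atomic_exchange(&l->slock, 1))
                ;       /* spin until we swap in 1 and the old value was 0 */
}
static void native_unlock(raw_spinlock_t *l) { atomic_store(&l->slock, 0); }

static struct pv_lock_ops pv_lock_ops = {
        .spin_is_locked = native_is_locked,
        .spin_lock      = native_lock,
        .spin_unlock    = native_unlock,
};

/* Generic spinlock API: one call through the ops table per operation */
static void spin_lock(raw_spinlock_t *l)   { pv_lock_ops.spin_lock(l); }
static void spin_unlock(raw_spinlock_t *l) { pv_lock_ops.spin_unlock(l); }

/* A hypothetical backend replaces the ops very early, before any lock is used */
static void noisy_lock(raw_spinlock_t *l)   { puts("backend lock");   native_lock(l); }
static void noisy_unlock(raw_spinlock_t *l) { puts("backend unlock"); native_unlock(l); }

static void use_backend_locks(void)
{
        pv_lock_ops.spin_lock   = noisy_lock;
        pv_lock_ops.spin_unlock = noisy_unlock;
}

int main(void)
{
        raw_spinlock_t lock = { 0 };

        use_backend_locks();    /* analogous to a backend hooking pv_lock_ops */
        spin_lock(&lock);
        spin_unlock(&lock);
        return 0;
}

In the series below, paravirt_use_bytelocks() and xen_init_spinlocks() play
the role of use_backend_locks(): they rewrite the ops table early, before any
spinlock has been taken.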
From 8efcbab674de2bee45a2e4cdf97de16b8e609ac8 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Mon, 7 Jul 2008 12:07:51 -0700
Subject: paravirt: introduce a "lock-byte" spinlock implementation

Implement a version of the old spinlock algorithm, in which everyone
spins waiting for a lock byte.  In order to be compatible with the
ticket-lock's use of a zero initializer, this uses the convention of
'0' for unlocked and '1' for locked.

This algorithm is much better than ticket locks in a virtual
environment, because it doesn't interact badly with the vcpu scheduler.
If there are multiple vcpus spinning on a lock and the lock is
released, the next vcpu to be scheduled will take the lock, rather than
cycling around until the next ticketed vcpu gets it.

To use this, you must call paravirt_use_bytelocks() very early, before
any spinlocks have been taken.

Signed-off-by: Jeremy Fitzhardinge
Cc: Jens Axboe
Cc: Peter Zijlstra
Cc: Christoph Lameter
Cc: Petr Tesarik
Cc: Virtualization
Cc: Xen devel
Cc: Thomas Friebel
Cc: Nick Piggin
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/paravirt.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index f3381686870..bba4041bb7f 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -268,6 +268,15 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
     return __get_cpu_var(paravirt_lazy_mode);
 }

+void __init paravirt_use_bytelocks(void)
+{
+    pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
+    pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
+    pv_lock_ops.spin_lock = __byte_spin_lock;
+    pv_lock_ops.spin_trylock = __byte_spin_trylock;
+    pv_lock_ops.spin_unlock = __byte_spin_unlock;
+}
+
 struct pv_info pv_info = {
     .name = "bare hardware",
     .paravirt_enabled = 0,
--
cgit v1.2.3

From 56397f8dadb40055479a8ffff23f21a890098a31 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Mon, 7 Jul 2008 12:07:52 -0700
Subject: xen: use lock-byte spinlock implementation

Switch to using the lock-byte spinlock implementation, to avoid the
worst of the performance hit from ticket locks.

Signed-off-by: Jeremy Fitzhardinge
Cc: Jens Axboe
Cc: Peter Zijlstra
Cc: Christoph Lameter
Cc: Petr Tesarik
Cc: Virtualization
Cc: Xen devel
Cc: Thomas Friebel
Signed-off-by: Ingo Molnar
---
 arch/x86/xen/smp.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index f702199312a..a8ebafc09d4 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -430,4 +430,5 @@ void __init xen_smp_init(void)
 {
     smp_ops = xen_smp_ops;
     xen_fill_possible_map();
+    paravirt_use_bytelocks();
 }
--
cgit v1.2.3
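For reference, the lock-byte algorithm the two patches above switch Xen to can
be sketched in plain C roughly as follows.  This uses GCC/Clang atomic
builtins and only illustrates the "0 = unlocked, 1 = locked" convention and
its compatibility with a zero-initialized lock word; it is not the kernel's
__byte_spin_* implementation, which works on the first byte of the raw
spinlock word using inline assembly.

/*
 * Sketch of a lock-byte spinlock (illustration, not kernel code).
 * Zero-initialized == unlocked, so it can share the ticket lock's
 * initializer.
 */
#include <stdbool.h>

struct byte_spinlock {
        unsigned char lock;     /* 0 -> free, 1 -> held */
};

static bool byte_spin_trylock(struct byte_spinlock *l)
{
        /* atomically swap in 1; we got the lock if the old value was 0 */
        return __atomic_exchange_n(&l->lock, 1, __ATOMIC_ACQUIRE) == 0;
}

static void byte_spin_lock(struct byte_spinlock *l)
{
        while (!byte_spin_trylock(l)) {
                /* spin on a plain read to avoid hammering the bus with xchg */
                while (__atomic_load_n(&l->lock, __ATOMIC_RELAXED))
                        __builtin_ia32_pause();         /* x86 rep;nop hint */
        }
}

static void byte_spin_unlock(struct byte_spinlock *l)
{
        __atomic_store_n(&l->lock, 0, __ATOMIC_RELEASE);
}

int main(void)
{
        struct byte_spinlock l = { 0 };         /* zero-initialized == unlocked */

        byte_spin_lock(&l);
        byte_spin_unlock(&l);
        return byte_spin_trylock(&l) ? 0 : 1;
}

Because any spinning vcpu can win the exchange the moment the byte drops back
to zero, the lock is unfair, but it no longer depends on the hypervisor
scheduling vcpus in ticket order -- which is exactly the property the commit
message above is after.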
From 2d9e1e2f58b5612aa4eab0ab54c84308a29dbd79 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Mon, 7 Jul 2008 12:07:53 -0700
Subject: xen: implement Xen-specific spinlocks

The standard ticket spinlocks are very expensive in a virtual
environment, because their performance depends on Xen's scheduler
giving vcpus time in the order that they're supposed to take the
spinlock.

This implements a Xen-specific spinlock, which should be much more
efficient.

The fast-path is essentially the old Linux-x86 locks, using a single
lock byte.  The locker decrements the byte; if the result is 0, then
they have the lock.  If the lock is negative, then the locker must spin
until the lock is positive again.

When there's contention, the locker spins for 2^16[*] iterations
waiting to get the lock.  If it fails to get the lock in that time, it
adds itself to the contention count in the lock and blocks on a per-cpu
event channel.

When unlocking the spinlock, the unlocker looks to see if there's
anyone blocked waiting for the lock by checking for a non-zero waiter
count.  If there's a waiter, it traverses the per-cpu "lock_spinners"
variable, which records which lock each CPU is waiting on.  It picks
one CPU waiting on the lock and sends it an event to wake it up.

This allows efficient fast-path spinlock operation, while allowing
spinning vcpus to give up their processor time while waiting for a
contended lock.

[*] 2^16 iterations is the threshold at which 98% of locks have been
taken according to Thomas Friebel's Xen Summit talk "Preventing Guests
from Spinning Around".  Therefore, we'd expect the lock and unlock slow
paths to be entered only about 2% of the time.

Signed-off-by: Jeremy Fitzhardinge
Cc: Jens Axboe
Cc: Peter Zijlstra
Cc: Christoph Lameter
Cc: Petr Tesarik
Cc: Virtualization
Cc: Xen devel
Cc: Thomas Friebel
Cc: Nick Piggin
Signed-off-by: Ingo Molnar
---
 arch/x86/xen/smp.c | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 171 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index a8ebafc09d4..e693812ac59 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -15,6 +15,7 @@
  * This does not handle HOTPLUG_CPU yet.
  */
 #include
+#include
 #include
 #include
@@ -35,6 +36,8 @@
 #include "xen-ops.h"
 #include "mmu.h"

+static void __cpuinit xen_init_lock_cpu(int cpu);
+
 cpumask_t xen_cpu_initialized_map;

 static DEFINE_PER_CPU(int, resched_irq);
@@ -179,6 +182,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
 {
     unsigned cpu;

+    xen_init_lock_cpu(0);
+
     smp_store_cpu_info(0);
     cpu_data(0).x86_max_cores = 1;
     set_cpu_sibling_map(0);
@@ -301,6 +306,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
     clear_tsk_thread_flag(idle, TIF_FORK);
 #endif
     xen_setup_timer(cpu);
+    xen_init_lock_cpu(cpu);

     per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;

@@ -413,6 +419,170 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
     return IRQ_HANDLED;
 }

+struct xen_spinlock {
+    unsigned char lock;         /* 0 -> free; 1 -> locked */
+    unsigned short spinners;    /* count of waiting cpus */
+};
+
+static int xen_spin_is_locked(struct raw_spinlock *lock)
+{
+    struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+    return xl->lock != 0;
+}
+
+static int xen_spin_is_contended(struct raw_spinlock *lock)
+{
+    struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+    /* Not strictly true; this is only the count of contended
+       lock-takers entering the slow path. */
+    return xl->spinners != 0;
+}
+
+static int xen_spin_trylock(struct raw_spinlock *lock)
+{
+    struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+    u8 old = 1;
+
+    asm("xchgb %b0,%1"
+        : "+q" (old), "+m" (xl->lock) : : "memory");
+
+    return old == 0;
+}
+
+static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
+static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
+
+static inline void spinning_lock(struct xen_spinlock *xl)
+{
+    __get_cpu_var(lock_spinners) = xl;
+    wmb();          /* set lock of interest before count */
+    asm(LOCK_PREFIX " incw %0"
+        : "+m" (xl->spinners) : : "memory");
+}
+
+static inline void unspinning_lock(struct xen_spinlock *xl)
+{
+    asm(LOCK_PREFIX " decw %0"
+        : "+m" (xl->spinners) : : "memory");
+    wmb();          /* decrement count before clearing lock */
+    __get_cpu_var(lock_spinners) = NULL;
+}
+
+static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
+{
+    struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+    int irq = __get_cpu_var(lock_kicker_irq);
+    int ret;
+
+    /* If kicker interrupts not initialized yet, just spin */
+    if (irq == -1)
+        return 0;
+
+    /* announce we're spinning */
+    spinning_lock(xl);
+
+    /* clear pending */
+    xen_clear_irq_pending(irq);
+
+    /* check again make sure it didn't become free while
+       we weren't looking */
+    ret = xen_spin_trylock(lock);
+    if (ret)
+        goto out;
+
+    /* block until irq becomes pending */
+    xen_poll_irq(irq);
+    kstat_this_cpu.irqs[irq]++;
+
+out:
+    unspinning_lock(xl);
+    return ret;
+}
+
+static void xen_spin_lock(struct raw_spinlock *lock)
+{
+    struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+    int timeout;
+    u8 oldval;
+
+    do {
+        timeout = 1 << 10;
+
+        asm("1: xchgb %1,%0\n"
+            "   testb %1,%1\n"
+            "   jz 3f\n"
+            "2: rep;nop\n"
+            "   cmpb $0,%0\n"
+            "   je 1b\n"
+            "   dec %2\n"
+            "   jnz 2b\n"
+            "3:\n"
+            : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
+            : "1" (1)
+            : "memory");
+
+    } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
+}
+
+static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
+{
+    int cpu;
+
+    for_each_online_cpu(cpu) {
+        /* XXX should mix up next cpu selection */
+        if (per_cpu(lock_spinners, cpu) == xl) {
+            xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
+            break;
+        }
+    }
+}
+
+static void xen_spin_unlock(struct raw_spinlock *lock)
+{
+    struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+    smp_wmb();      /* make sure no writes get moved after unlock */
+    xl->lock = 0;   /* release lock */
+
+    /* make sure unlock happens before kick */
+    barrier();
+
+    if (unlikely(xl->spinners))
+        xen_spin_unlock_slow(xl);
+}
+
+static __cpuinit void xen_init_lock_cpu(int cpu)
+{
+    int irq;
+    const char *name;
+
+    name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
+    irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
+                                 cpu,
+                                 xen_reschedule_interrupt,
+                                 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+                                 name,
+                                 NULL);
+
+    if (irq >= 0) {
+        disable_irq(irq); /* make sure it's never delivered */
+        per_cpu(lock_kicker_irq, cpu) = irq;
+    }
+
+    printk("cpu %d spinlock event irq %d\n", cpu, irq);
+}
+
+static void __init xen_init_spinlocks(void)
+{
+    pv_lock_ops.spin_is_locked = xen_spin_is_locked;
+    pv_lock_ops.spin_is_contended = xen_spin_is_contended;
+    pv_lock_ops.spin_lock = xen_spin_lock;
+    pv_lock_ops.spin_trylock = xen_spin_trylock;
+    pv_lock_ops.spin_unlock = xen_spin_unlock;
+}
+
 static const struct smp_ops xen_smp_ops __initdata = {
     .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
     .smp_prepare_cpus = xen_smp_prepare_cpus,
@@ -430,5 +600,5 @@ void __init xen_smp_init(void)
 {
     smp_ops = xen_smp_ops;
     xen_fill_possible_map();
-    paravirt_use_bytelocks();
+    xen_init_spinlocks();
 }
--
cgit v1.2.3
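The spin-then-block structure of xen_spin_lock()/xen_spin_unlock() above can
be mimicked in userspace, which may make the slow path easier to follow.  The
sketch below substitutes a pthread mutex/condvar pair for the per-cpu Xen
event channel, and a simple waiter count for the "spinners" field;
SPIN_TIMEOUT, pv_spin_lock and the other names are assumptions made for the
illustration, not kernel or Xen interfaces.

/*
 * Spin-then-block sketch (illustration only): spin for a bounded number of
 * iterations, then register as a waiter and sleep; the unlocker kicks one
 * waiter only if the waiter count is non-zero.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define SPIN_TIMEOUT    (1 << 10)       /* bounded busy-wait before blocking */

struct pv_spinlock {
        unsigned char lock;             /* 0 -> free, 1 -> held */
        unsigned short spinners;        /* waiters that gave up spinning */
        pthread_mutex_t wait_mtx;       /* stands in for the per-cpu event channel */
        pthread_cond_t wait_cv;
};

#define PV_SPINLOCK_INIT { 0, 0, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER }

static bool try_lock(struct pv_spinlock *l)
{
        return __atomic_exchange_n(&l->lock, 1, __ATOMIC_SEQ_CST) == 0;
}

static void pv_spin_lock(struct pv_spinlock *l)
{
        for (;;) {
                /* fast path: bounded spinning on the lock byte */
                for (int i = 0; i < SPIN_TIMEOUT; i++)
                        if (try_lock(l))
                                return;

                /* slow path: announce ourselves, re-check, then block */
                pthread_mutex_lock(&l->wait_mtx);
                __atomic_fetch_add(&l->spinners, 1, __ATOMIC_SEQ_CST);
                if (try_lock(l)) {              /* may have been freed while we queued */
                        __atomic_fetch_sub(&l->spinners, 1, __ATOMIC_SEQ_CST);
                        pthread_mutex_unlock(&l->wait_mtx);
                        return;
                }
                pthread_cond_wait(&l->wait_cv, &l->wait_mtx);   /* ~ xen_poll_irq() */
                __atomic_fetch_sub(&l->spinners, 1, __ATOMIC_SEQ_CST);
                pthread_mutex_unlock(&l->wait_mtx);
                /* woken: go around and compete for the lock again */
        }
}

static void pv_spin_unlock(struct pv_spinlock *l)
{
        __atomic_store_n(&l->lock, 0, __ATOMIC_SEQ_CST);        /* release lock */

        /* kick one blocked waiter only if somebody is actually waiting */
        if (__atomic_load_n(&l->spinners, __ATOMIC_SEQ_CST) != 0) {
                pthread_mutex_lock(&l->wait_mtx);
                pthread_cond_signal(&l->wait_cv);
                pthread_mutex_unlock(&l->wait_mtx);
        }
}

static struct pv_spinlock test_lock = PV_SPINLOCK_INIT;
static long counter;

static void *worker(void *arg)
{
        for (int i = 0; i < 100000; i++) {
                pv_spin_lock(&test_lock);
                counter++;
                pv_spin_unlock(&test_lock);
        }
        return arg;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, worker, NULL);
        worker(NULL);
        pthread_join(t, NULL);
        printf("counter = %ld (expect 200000)\n", counter);
        return 0;
}

In the kernel version, xen_poll_irq() plays the role of the condition-variable
wait and the event-channel kick that of the signal; the "re-check after
announcing ourselves" step is what closes the race with a release that happens
just before the waiter blocks, and the uncontended unlock stays cheap because
the kick only happens when the waiter count is non-zero.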
From 4bb689eee12ceb6d669a0c9a519037c049a8af38 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Wed, 9 Jul 2008 14:33:33 +0200
Subject: x86: paravirt spinlocks, !CONFIG_SMP build fixes

Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/paravirt.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index bba4041bb7f..6aa8aed06d5 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -270,11 +270,13 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)

 void __init paravirt_use_bytelocks(void)
 {
+#ifdef CONFIG_SMP
     pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
     pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
     pv_lock_ops.spin_lock = __byte_spin_lock;
     pv_lock_ops.spin_trylock = __byte_spin_trylock;
     pv_lock_ops.spin_unlock = __byte_spin_unlock;
+#endif
 }

 struct pv_info pv_info = {
@@ -461,12 +463,14 @@ struct pv_mmu_ops pv_mmu_ops = {
 };

 struct pv_lock_ops pv_lock_ops = {
+#ifdef CONFIG_SMP
     .spin_is_locked = __ticket_spin_is_locked,
     .spin_is_contended = __ticket_spin_is_contended,

     .spin_lock = __ticket_spin_lock,
     .spin_trylock = __ticket_spin_trylock,
     .spin_unlock = __ticket_spin_unlock,
+#endif
 };

 EXPORT_SYMBOL_GPL(pv_time_ops);
--
cgit v1.2.3
From 9af98578d6af588f52d0dacd64fe42caa405a327 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Wed, 9 Jul 2008 14:39:15 +0200
Subject: x86: paravirt spinlocks, modular build fix

fix:

  MODPOST 408 modules
ERROR: "pv_lock_ops" [net/dccp/dccp.ko] undefined!
ERROR: "pv_lock_ops" [fs/jbd2/jbd2.ko] undefined!
ERROR: "pv_lock_ops" [drivers/media/common/saa7146_vv.ko] undefined!

Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/paravirt.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 6aa8aed06d5..3edfd7af22a 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -472,6 +472,7 @@ struct pv_lock_ops pv_lock_ops = {
     .spin_unlock = __ticket_spin_unlock,
 #endif
 };
+EXPORT_SYMBOL_GPL(pv_lock_ops);

 EXPORT_SYMBOL_GPL(pv_time_ops);
 EXPORT_SYMBOL    (pv_cpu_ops);
--
cgit v1.2.3
From 34646bca474142e1424e5f6c4a33cb2ba0930ea1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Wed, 9 Jul 2008 15:42:09 +0200
Subject: x86, paravirt-spinlocks: fix boot hang

the paravirt-spinlock patches caused a boot hang with this config:

  http://redhat.com/~mingo/misc/config-Wed_Jul__9_14_47_04_CEST_2008.bad

i have bisected it down to:

|  commit e17b58c2e85bc2ad2afc07fb8d898017c2b75ed1
|  Author: Jeremy Fitzhardinge
|  Date:   Mon Jul 7 12:07:53 2008 -0700
|
|      xen: implement Xen-specific spinlocks

i.e. applying that patch alone causes the hang.

The hang happens in the ftrace self-test:

  initcall utsname_sysctl_init+0x0/0x19 returned 0 after 0 msecs
  calling  init_sched_switch_trace+0x0/0x4c
  Testing tracer sched_switch: PASSED
  initcall init_sched_switch_trace+0x0/0x4c returned 0 after 167 msecs
  calling  init_function_trace+0x0/0x12
  Testing tracer ftrace: [hard hang]

it should have continued like this:

  Testing tracer ftrace: PASSED
  initcall init_function_trace+0x0/0x12 returned 0 after 198 msecs
  calling  init_irqsoff_tracer+0x0/0x14
  Testing tracer irqsoff: PASSED
  initcall init_irqsoff_tracer+0x0/0x14 returned 0 after 3 msecs
  calling  init_mmio_trace+0x0/0x12
  initcall init_mmio_trace+0x0/0x12 returned 0 after 0 msecs

the problem is that such low-level primitives as spinlocks should never
be built with -pg (which ftrace does).  Marking paravirt.o as non-pg
and marking all spinlock ops as always-inline solve the hang.

Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5112c84f542..78d52171400 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -7,10 +7,11 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)

 ifdef CONFIG_FTRACE
-# Do not profile debug utilities
+# Do not profile debug and lowlevel utilities
 CFLAGS_REMOVE_tsc_64.o = -pg
 CFLAGS_REMOVE_tsc_32.o = -pg
 CFLAGS_REMOVE_rtc.o = -pg
+CFLAGS_REMOVE_paravirt.o = -pg
 endif

 #
--
cgit v1.2.3
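The Makefile hunk above covers the -pg half of the fix; the "always-inline"
half can be illustrated with a small, self-contained C program.  gcc -pg
instruments every out-of-line function with an mcount call, so a spinlock
wrapper that remains a real function becomes traceable, and the function
tracer can then recurse into the very locks it needs; a forced-inline wrapper
leaves no function entry for -pg to hook on the lock fast path.  The names
below are hypothetical stand-ins, not the arch/x86 header code.

/*
 * Illustration of why forcing spinlock wrappers inline keeps them out of
 * ftrace's reach (stand-in names, compiles as ordinary userspace C).
 */
#include <stdio.h>

#define __always_inline inline __attribute__((always_inline))

struct raw_spinlock { unsigned int slock; };

static void dummy_lock(struct raw_spinlock *lock) { lock->slock = 1; }

struct pv_lock_ops {
        void (*spin_lock)(struct raw_spinlock *lock);
};

static struct pv_lock_ops pv_lock_ops = { .spin_lock = dummy_lock };

/* Built with -pg, this stays a real function and receives an mcount hook,
 * so the function tracer can trace it -- and may then take a lock itself. */
static void traced_spin_lock(struct raw_spinlock *lock)
{
        pv_lock_ops.spin_lock(lock);
}

/* Forced inline, the wrapper is folded into each caller; -pg has nothing
 * left to instrument on the locking fast path. */
static __always_inline void inlined_spin_lock(struct raw_spinlock *lock)
{
        pv_lock_ops.spin_lock(lock);
}

int main(void)
{
        struct raw_spinlock lock = { 0 };

        traced_spin_lock(&lock);
        inlined_spin_lock(&lock);
        printf("slock = %u\n", lock.slock);
        return 0;
}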