From ad66dd340f561bdde2285992314d9e4fd9b6191e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 11 Jul 2008 13:11:56 -0700 Subject: x2apic: xen64 paravirt basic apic ops Define the Xen specific basic apic ops, in addition to paravirt apic ops, with some misc warning fixes. Signed-off-by: Suresh Siddha Cc: Jeremy Fitzhardinge Cc: akpm@linux-foundation.org Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index dcd4e51f2f1..54e25566753 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -548,16 +548,45 @@ static void xen_io_delay(void) } #ifdef CONFIG_X86_LOCAL_APIC -static u32 xen_apic_read(unsigned long reg) +static u32 xen_apic_read(u32 reg) { return 0; } -static void xen_apic_write(unsigned long reg, u32 val) +static void xen_apic_write(u32 reg, u32 val) { /* Warn to see if there's any stray references */ WARN_ON(1); } + +#ifdef CONFIG_X86_64 +static u64 xen_apic_icr_read(void) +{ + return 0; +} + +static void xen_apic_icr_write(u32 low, u32 id) +{ + /* Warn to see if there's any stray references */ + WARN_ON(1); +} + +static void xen_apic_wait_icr_idle(void) +{ + return; +} + +static struct apic_ops xen_basic_apic_ops = { + .read = xen_apic_read, + .write = xen_apic_write, + .write_atomic = xen_apic_write, + .icr_read = xen_apic_icr_read, + .icr_write = xen_apic_icr_write, + .wait_icr_idle = xen_apic_wait_icr_idle, + .safe_wait_icr_idle = xen_apic_wait_icr_idle, +}; +#endif + #endif static void xen_flush_tlb(void) @@ -1130,9 +1159,11 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC +#ifndef CONFIG_X86_64 .apic_write = xen_apic_write, .apic_write_atomic = xen_apic_write, .apic_read = xen_apic_read, +#endif .setup_boot_clock = paravirt_nop, .setup_secondary_clock = paravirt_nop, .startup_ipi_hook = paravirt_nop, @@ -1291,6 +1322,12 @@ asmlinkage void __init xen_start_kernel(void) pv_irq_ops = xen_irq_ops; pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; +#ifdef CONFIG_X86_64 + /* + * for 64bit, set up the basic apic ops aswell. + */ + apic_ops = &xen_basic_apic_ops; +#endif if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; -- cgit v1.2.3 From 94a8c3c2437c8946f1b6c8e0b2c560a7db8ed3c6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 13 Jul 2008 22:19:35 -0700 Subject: x86: let 32bit use apic_ops too - fix fix for pv - clean up the namespace there too.
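For reference, the ops table that the patch above defines stubs for, and that this fix adjusts, has roughly the following shape. The field list is reconstructed from the initializers in the diffs; the authoritative definition lives in the x86 apic headers, so treat this as a sketch rather than the exact upstream layout:

/* Field list inferred from the xen_basic_apic_ops initializers in
 * these patches; the real declaration may differ in detail. */
struct apic_ops {
	u32 (*read)(u32 reg);
	void (*write)(u32 reg, u32 val);
	void (*write_atomic)(u32 reg, u32 val);
	u64 (*icr_read)(void);
	void (*icr_write)(u32 low, u32 id);
	void (*wait_icr_idle)(void);
	u32 (*safe_wait_icr_idle)(void);
};

Note that .safe_wait_icr_idle returns u32, which is why the diff below stops reusing the void xen_apic_wait_icr_idle() for that slot and adds a separate xen_safe_apic_wait_icr_idle() returning 0.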
Signed-off-by: Yinghai Lu Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 54e25566753..d11dda7ebd7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -559,7 +559,6 @@ static void xen_apic_write(u32 reg, u32 val) WARN_ON(1); } -#ifdef CONFIG_X86_64 static u64 xen_apic_icr_read(void) { return 0; @@ -576,6 +575,11 @@ static void xen_apic_wait_icr_idle(void) return; } +static u32 xen_safe_apic_wait_icr_idle(void) +{ + return 0; +} + static struct apic_ops xen_basic_apic_ops = { .read = xen_apic_read, .write = xen_apic_write, @@ -583,9 +587,8 @@ static struct apic_ops xen_basic_apic_ops = { .icr_read = xen_apic_icr_read, .icr_write = xen_apic_icr_write, .wait_icr_idle = xen_apic_wait_icr_idle, - .safe_wait_icr_idle = xen_apic_wait_icr_idle, + .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle, }; -#endif #endif @@ -1159,11 +1162,6 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC -#ifndef CONFIG_X86_64 - .apic_write = xen_apic_write, - .apic_write_atomic = xen_apic_write, - .apic_read = xen_apic_read, -#endif .setup_boot_clock = paravirt_nop, .setup_secondary_clock = paravirt_nop, .startup_ipi_hook = paravirt_nop, @@ -1322,9 +1320,10 @@ asmlinkage void __init xen_start_kernel(void) pv_irq_ops = xen_irq_ops; pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; -#ifdef CONFIG_X86_64 + +#ifdef CONFIG_X86_LOCAL_APIC /* - * for 64bit, set up the basic apic ops aswell. + * set up the basic apic ops. */ apic_ops = &xen_basic_apic_ops; #endif -- cgit v1.2.3 From caf43bf7c6a55e89b6df5179df434d67e24aa32e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 20 Jul 2008 14:06:50 +0200 Subject: x86, xen: fix apic_ops build on UP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/xen/enlighten.c:615: error: variable ‘xen_basic_apic_ops’ has initializer but incomplete type arch/x86/xen/enlighten.c:616: error: unknown field ‘read’ specified in initializer [...] Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 008b7b69581..e4d1459a63d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -35,6 +35,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3 From 38ffbe66d59051fd9cfcfc8545f164700e2fa3bc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 23 Jul 2008 14:21:18 -0700 Subject: x86/paravirt/xen: properly fill out the ldt ops LTP testing showed that Xen does not properly implement sys_modify_ldt(). This patch does the final little bits needed to make the ldt work properly. 
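For context, what such tests exercise is the modify_ldt(2) system call; a minimal user-space illustration (a hypothetical snippet, not part of this patch or of LTP) would be:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/ldt.h>

/* Hypothetical test fragment: installing a single LDT descriptor
 * drives the kernel through the alloc_ldt/write_ldt_entry paths that
 * the hooks below implement for Xen. */
static int install_ldt_entry(void)
{
	struct user_desc d;

	memset(&d, 0, sizeof(d));
	d.entry_number   = 0;
	d.base_addr      = 0;
	d.limit          = 0xfffff;
	d.seg_32bit      = 1;
	d.limit_in_pages = 1;
	d.useable        = 1;

	return syscall(SYS_modify_ldt, 1, &d, sizeof(d)); /* 1 == write */
}

The Xen-specific subtlety is that the hypervisor only honours an LDT whose backing pages the guest has mapped read-only, which is what xen_alloc_ldt()/xen_free_ldt() below arrange.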
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9ff6e3cbf08..06219e60e9c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -325,6 +325,26 @@ static unsigned long xen_store_tr(void) return 0; } +static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) +{ + unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); + void *v = ldt; + int i; + + for(i = 0; i < pages; i += PAGE_SIZE) + make_lowmem_page_readonly(v + i); +} + +static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) +{ + unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); + void *v = ldt; + int i; + + for(i = 0; i < pages; i += PAGE_SIZE) + make_lowmem_page_readwrite(v + i); +} + static void xen_set_ldt(const void *addr, unsigned entries) { struct mmuext_op *op; @@ -1220,6 +1240,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .load_gs_index = xen_load_gs_index, #endif + .alloc_ldt = xen_alloc_ldt, + .free_ldt = xen_free_ldt, + .store_gdt = native_store_gdt, .store_idt = native_store_idt, .store_tr = xen_store_tr, -- cgit v1.2.3 From d5de8841355a48f7f634a04507185eaf1f9755e3 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 23 Jul 2008 13:28:58 -0700 Subject: x86: split spinlock implementations out into their own files ftrace requires certain low-level code, like spinlocks and timestamps, to be compiled without -pg in order to avoid infinite recursion. This patch splits out the core paravirt spinlocks and the Xen spinlocks into separate files which can be compiled without -pg. Also do xen/time.c while we're about it. As a result, we can now use ftrace within a Xen domain. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/Makefile | 8 ++- arch/x86/xen/smp.c | 167 ------------------------------------------- arch/x86/xen/spinlock.c | 183 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 3 + 4 files changed, 193 insertions(+), 168 deletions(-) create mode 100644 arch/x86/xen/spinlock.c (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 59c1e539aed..5bfee243cf9 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,10 @@ +ifdef CONFIG_FTRACE +# Do not profile debug and lowlevel utilities +CFLAGS_REMOVE_spinlock.o = -pg +CFLAGS_REMOVE_time.o = -pg +endif + obj-y := enlighten.o setup.o multicalls.o mmu.o \ time.o xen-asm_$(BITS).o grant-table.o suspend.o -obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_SMP) += smp.o spinlock.o diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index d8faf79a0a1..baca7f2fbd8 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -15,7 +15,6 @@ * This does not handle HOTPLUG_CPU yet. 
*/ #include -#include #include #include @@ -36,8 +35,6 @@ #include "xen-ops.h" #include "mmu.h" -static void __cpuinit xen_init_lock_cpu(int cpu); - cpumask_t xen_cpu_initialized_map; static DEFINE_PER_CPU(int, resched_irq); @@ -419,170 +416,6 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -struct xen_spinlock { - unsigned char lock; /* 0 -> free; 1 -> locked */ - unsigned short spinners; /* count of waiting cpus */ -}; - -static int xen_spin_is_locked(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - return xl->lock != 0; -} - -static int xen_spin_is_contended(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - /* Not strictly true; this is only the count of contended - lock-takers entering the slow path. */ - return xl->spinners != 0; -} - -static int xen_spin_trylock(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - u8 old = 1; - - asm("xchgb %b0,%1" - : "+q" (old), "+m" (xl->lock) : : "memory"); - - return old == 0; -} - -static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; -static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); - -static inline void spinning_lock(struct xen_spinlock *xl) -{ - __get_cpu_var(lock_spinners) = xl; - wmb(); /* set lock of interest before count */ - asm(LOCK_PREFIX " incw %0" - : "+m" (xl->spinners) : : "memory"); -} - -static inline void unspinning_lock(struct xen_spinlock *xl) -{ - asm(LOCK_PREFIX " decw %0" - : "+m" (xl->spinners) : : "memory"); - wmb(); /* decrement count before clearing lock */ - __get_cpu_var(lock_spinners) = NULL; -} - -static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - int irq = __get_cpu_var(lock_kicker_irq); - int ret; - - /* If kicker interrupts not initialized yet, just spin */ - if (irq == -1) - return 0; - - /* announce we're spinning */ - spinning_lock(xl); - - /* clear pending */ - xen_clear_irq_pending(irq); - - /* check again make sure it didn't become free while - we weren't looking */ - ret = xen_spin_trylock(lock); - if (ret) - goto out; - - /* block until irq becomes pending */ - xen_poll_irq(irq); - kstat_this_cpu.irqs[irq]++; - -out: - unspinning_lock(xl); - return ret; -} - -static void xen_spin_lock(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - int timeout; - u8 oldval; - - do { - timeout = 1 << 10; - - asm("1: xchgb %1,%0\n" - " testb %1,%1\n" - " jz 3f\n" - "2: rep;nop\n" - " cmpb $0,%0\n" - " je 1b\n" - " dec %2\n" - " jnz 2b\n" - "3:\n" - : "+m" (xl->lock), "=q" (oldval), "+r" (timeout) - : "1" (1) - : "memory"); - - } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock))); -} - -static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) -{ - int cpu; - - for_each_online_cpu(cpu) { - /* XXX should mix up next cpu selection */ - if (per_cpu(lock_spinners, cpu) == xl) { - xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); - break; - } - } -} - -static void xen_spin_unlock(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - smp_wmb(); /* make sure no writes get moved after unlock */ - xl->lock = 0; /* release lock */ - - /* make sure unlock happens before kick */ - barrier(); - - if (unlikely(xl->spinners)) - xen_spin_unlock_slow(xl); -} - -static __cpuinit void xen_init_lock_cpu(int cpu) -{ - int irq; - const char *name; - - name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); - 
irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, - cpu, - xen_reschedule_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, - name, - NULL); - - if (irq >= 0) { - disable_irq(irq); /* make sure it's never delivered */ - per_cpu(lock_kicker_irq, cpu) = irq; - } - - printk("cpu %d spinlock event irq %d\n", cpu, irq); -} - -static void __init xen_init_spinlocks(void) -{ - pv_lock_ops.spin_is_locked = xen_spin_is_locked; - pv_lock_ops.spin_is_contended = xen_spin_is_contended; - pv_lock_ops.spin_lock = xen_spin_lock; - pv_lock_ops.spin_trylock = xen_spin_trylock; - pv_lock_ops.spin_unlock = xen_spin_unlock; -} - static const struct smp_ops xen_smp_ops __initdata = { .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_smp_prepare_cpus, diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c new file mode 100644 index 00000000000..8dc4d31da67 --- /dev/null +++ b/arch/x86/xen/spinlock.c @@ -0,0 +1,183 @@ +/* + * Split spinlock implementation out into its own file, so it can be + * compiled in a FTRACE-compatible way. + */ +#include +#include + +#include + +#include +#include + +#include "xen-ops.h" + +struct xen_spinlock { + unsigned char lock; /* 0 -> free; 1 -> locked */ + unsigned short spinners; /* count of waiting cpus */ +}; + +static int xen_spin_is_locked(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + return xl->lock != 0; +} + +static int xen_spin_is_contended(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + /* Not strictly true; this is only the count of contended + lock-takers entering the slow path. */ + return xl->spinners != 0; +} + +static int xen_spin_trylock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + u8 old = 1; + + asm("xchgb %b0,%1" + : "+q" (old), "+m" (xl->lock) : : "memory"); + + return old == 0; +} + +static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; +static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); + +static inline void spinning_lock(struct xen_spinlock *xl) +{ + __get_cpu_var(lock_spinners) = xl; + wmb(); /* set lock of interest before count */ + asm(LOCK_PREFIX " incw %0" + : "+m" (xl->spinners) : : "memory"); +} + +static inline void unspinning_lock(struct xen_spinlock *xl) +{ + asm(LOCK_PREFIX " decw %0" + : "+m" (xl->spinners) : : "memory"); + wmb(); /* decrement count before clearing lock */ + __get_cpu_var(lock_spinners) = NULL; +} + +static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + int irq = __get_cpu_var(lock_kicker_irq); + int ret; + + /* If kicker interrupts not initialized yet, just spin */ + if (irq == -1) + return 0; + + /* announce we're spinning */ + spinning_lock(xl); + + /* clear pending */ + xen_clear_irq_pending(irq); + + /* check again make sure it didn't become free while + we weren't looking */ + ret = xen_spin_trylock(lock); + if (ret) + goto out; + + /* block until irq becomes pending */ + xen_poll_irq(irq); + kstat_this_cpu.irqs[irq]++; + +out: + unspinning_lock(xl); + return ret; +} + +static void xen_spin_lock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + int timeout; + u8 oldval; + + do { + timeout = 1 << 10; + + asm("1: xchgb %1,%0\n" + " testb %1,%1\n" + " jz 3f\n" + "2: rep;nop\n" + " cmpb $0,%0\n" + " je 1b\n" + " dec %2\n" + " jnz 2b\n" + "3:\n" + : "+m" (xl->lock), "=q" (oldval), "+r" (timeout) + : "1" (1) + : "memory"); + + } while 
(unlikely(oldval != 0 && !xen_spin_lock_slow(lock))); +} + +static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) +{ + int cpu; + + for_each_online_cpu(cpu) { + /* XXX should mix up next cpu selection */ + if (per_cpu(lock_spinners, cpu) == xl) { + xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); + break; + } + } +} + +static void xen_spin_unlock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + smp_wmb(); /* make sure no writes get moved after unlock */ + xl->lock = 0; /* release lock */ + + /* make sure unlock happens before kick */ + barrier(); + + if (unlikely(xl->spinners)) + xen_spin_unlock_slow(xl); +} + +static irqreturn_t dummy_handler(int irq, void *dev_id) +{ + BUG(); + return IRQ_HANDLED; +} + +void __cpuinit xen_init_lock_cpu(int cpu) +{ + int irq; + const char *name; + + name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); + irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, + cpu, + dummy_handler, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + name, + NULL); + + if (irq >= 0) { + disable_irq(irq); /* make sure it's never delivered */ + per_cpu(lock_kicker_irq, cpu) = irq; + } + + printk("cpu %d spinlock event irq %d\n", cpu, irq); +} + +void __init xen_init_spinlocks(void) +{ + pv_lock_ops.spin_is_locked = xen_spin_is_locked; + pv_lock_ops.spin_is_contended = xen_spin_is_contended; + pv_lock_ops.spin_lock = xen_spin_lock; + pv_lock_ops.spin_trylock = xen_spin_trylock; + pv_lock_ops.spin_unlock = xen_spin_unlock; +} diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index dd3c23152a2..8847fb34f17 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -50,6 +50,9 @@ void __init xen_setup_vcpu_info_placement(void); #ifdef CONFIG_SMP void xen_smp_init(void); +void __init xen_init_spinlocks(void); +__cpuinit void xen_init_lock_cpu(int cpu); + extern cpumask_t xen_cpu_initialized_map; #else static inline void xen_smp_init(void) {} -- cgit v1.2.3 From b56afe1d41653fb07ab1b5af5ccc12001c4dd5a0 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Thu, 24 Jul 2008 12:15:45 -0300 Subject: x86, xen: Use native_pte_flags instead of native_pte_val for .pte_flags Using native_pte_val triggers the BUG_ON() in the paravirt_ops version of pte_flags(). Signed-off-by: Eduardo Habkost Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 06219e60e9c..e2767c28dac 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1347,7 +1347,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .ptep_modify_prot_commit = __ptep_modify_prot_commit, .pte_val = xen_pte_val, - .pte_flags = native_pte_val, + .pte_flags = native_pte_flags, .pgd_val = xen_pgd_val, .make_pte = xen_make_pte, -- cgit v1.2.3 From a05d2ebab28011c2f3f520833f4bfdd2fd1b9c02 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 27 Jul 2008 08:45:02 -0700 Subject: xen: fix allocation and use of large ldts When the ldt gets to more than 1 page in size, the kernel uses vmalloc to allocate it. This means that: - when making the ldt RO, we must update the pages in both the vmalloc mapping and the linear mapping to make sure there are no RW aliases. 
- we need to use arbitrary_virt_to_machine to compute the machine addr for each update Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 51 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 10 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e2767c28dac..b011e4a5dbb 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -325,24 +325,55 @@ static unsigned long xen_store_tr(void) return 0; } +/* + * If 'v' is a vmalloc mapping, then find the linear mapping of the + * page (if any) and also set its protections to match: + */ +static void set_aliased_prot(void *v, pgprot_t prot) +{ + int level; + pte_t *ptep; + pte_t pte; + unsigned long pfn; + struct page *page; + + ptep = lookup_address((unsigned long)v, &level); + BUG_ON(ptep == NULL); + + pfn = pte_pfn(*ptep); + page = pfn_to_page(pfn); + + pte = pfn_pte(pfn, prot); + + if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) + BUG(); + + if (!PageHighMem(page)) { + void *av = __va(PFN_PHYS(pfn)); + + if (av != v) + if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0)) + BUG(); + } else + kmap_flush_unused(); +} + static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) { - unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); - void *v = ldt; + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; int i; - for(i = 0; i < pages; i += PAGE_SIZE) - make_lowmem_page_readonly(v + i); + for(i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL_RO); } static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) { - unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); - void *v = ldt; + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; int i; - for(i = 0; i < pages; i += PAGE_SIZE) - make_lowmem_page_readwrite(v + i); + for(i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL); } static void xen_set_ldt(const void *addr, unsigned entries) @@ -446,7 +477,7 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, const void *ptr) { unsigned long lp = (unsigned long)&dt[entrynum]; - xmaddr_t mach_lp = virt_to_machine(lp); + xmaddr_t mach_lp = arbitrary_virt_to_machine(lp); u64 entry = *(u64 *)ptr; preempt_disable(); @@ -579,7 +610,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, } static void xen_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) + struct thread_struct *thread) { struct multicall_space mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); -- cgit v1.2.3 From d89961e2dc87b6e30b8e3f60bd2af5cd92cf4643 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 24 Jul 2008 13:48:58 -0700 Subject: xen: suppress known wrmsrs In general, Xen doesn't support wrmsr from an unprivileged domain; it just ends up ignoring the instruction and printing a message on the console. Given that there are sets of MSRs we know the kernel will try to write to, but we don't care, just eat them in xen_write_msr to cut down on console noise. 
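For context, the writes being eaten come from the kernel's own boot-time fast-syscall setup. A simplified sketch of the native 64-bit version follows; it is written from memory of syscall_init(), so the exact constants and symbols are assumptions, not part of this patch:

/* Sketch (assumed, simplified): the MSR writes a PV guest cannot
 * usefully perform, since Xen installs the real entry points via
 * hypercalls instead. */
static void syscall_setup_sketch(void)
{
	/* SYSCALL/SYSRET segment selector bases */
	wrmsrl(MSR_STAR, ((u64)__USER32_CS) << 48 | ((u64)__KERNEL_CS) << 32);
	/* 64-bit and compat syscall entry points */
	wrmsrl(MSR_LSTAR, (unsigned long)system_call);
	wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
	/* rflags bits cleared on syscall entry */
	wrmsrl(MSR_SYSCALL_MASK, X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_IF);
}

Because a PV guest registers these entry points with Xen through hypercalls (as the comment added in the diff below notes), letting the writes fall through to the hypervisor would only produce warnings on the Xen console.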
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b011e4a5dbb..b795470ec06 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -854,6 +854,19 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) ret = -EFAULT; break; #endif + + case MSR_STAR: + case MSR_CSTAR: + case MSR_LSTAR: + case MSR_SYSCALL_MASK: + case MSR_IA32_SYSENTER_CS: + case MSR_IA32_SYSENTER_ESP: + case MSR_IA32_SYSENTER_EIP: + /* Fast syscall setup is all done in hypercalls, so + these are all ignored. Stub them out here to stop + Xen console noise. */ + break; + default: ret = native_write_msr_safe(msr, low, high); } -- cgit v1.2.3 From 0d1edf46ba229b46efacf75c0544b88c05a7b266 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 28 Jul 2008 11:53:57 -0700 Subject: xen: compile irq functions without -pg for ftrace For some reason I managed to miss a bunch of irq-related functions which also need to be compiled without -pg when using ftrace. This patch moves them into their own file, and starts a cleanup process I've been meaning to do anyway. Signed-off-by: Jeremy Fitzhardinge Cc: Sam Ravnborg Cc: "Alex Nixon (Intern)" Cc: Eduardo Habkost Signed-off-by: Ingo Molnar --- arch/x86/xen/Makefile | 3 +- arch/x86/xen/enlighten.c | 122 +-------------------------------------- arch/x86/xen/irq.c | 143 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-asm_32.S | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- arch/x86/xen/xen-ops.h | 1 + 6 files changed, 150 insertions(+), 123 deletions(-) create mode 100644 arch/x86/xen/irq.c (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 5bfee243cf9..9ee745fa552 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -2,9 +2,10 @@ ifdef CONFIG_FTRACE # Do not profile debug and lowlevel utilities CFLAGS_REMOVE_spinlock.o = -pg CFLAGS_REMOVE_time.o = -pg +CFLAGS_REMOVE_irq.o = -pg endif -obj-y := enlighten.o setup.o multicalls.o mmu.o \ +obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm_$(BITS).o grant-table.o suspend.o obj-$(CONFIG_SMP) += smp.o spinlock.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b795470ec06..cf8b3a93122 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -226,94 +225,6 @@ static unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } -static unsigned long xen_save_fl(void) -{ - struct vcpu_info *vcpu; - unsigned long flags; - - vcpu = x86_read_percpu(xen_vcpu); - - /* flag has opposite sense of mask */ - flags = !vcpu->evtchn_upcall_mask; - - /* convert to IF type flag - -0 -> 0x00000000 - -1 -> 0xffffffff - */ - return (-flags) & X86_EFLAGS_IF; -} - -static void xen_restore_fl(unsigned long flags) -{ - struct vcpu_info *vcpu; - - /* convert from IF type flag */ - flags = !(flags & X86_EFLAGS_IF); - - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - vcpu = x86_read_percpu(xen_vcpu); - vcpu->evtchn_upcall_mask = flags; - preempt_enable_no_resched(); - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. 
*/ - - if (flags == 0) { - preempt_check_resched(); - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - force_evtchn_callback(); - } -} - -static void xen_irq_disable(void) -{ - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; - preempt_enable_no_resched(); -} - -static void xen_irq_enable(void) -{ - struct vcpu_info *vcpu; - - /* We don't need to worry about being preempted here, since - either a) interrupts are disabled, so no preemption, or b) - the caller is confused and is trying to re-enable interrupts - on an indeterminate processor. */ - - vcpu = x86_read_percpu(xen_vcpu); - vcpu->evtchn_upcall_mask = 0; - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ - - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - force_evtchn_callback(); -} - -static void xen_safe_halt(void) -{ - /* Blocking includes an implicit local_irq_enable(). */ - if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) - BUG(); -} - -static void xen_halt(void) -{ - if (irqs_disabled()) - HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); - else - xen_safe_halt(); -} - static void xen_leave_lazy(void) { paravirt_leave_lazy(paravirt_get_lazy_mode()); @@ -1308,36 +1219,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { }, }; -static void __init __xen_init_IRQ(void) -{ -#ifdef CONFIG_X86_64 - int i; - - /* Create identity vector->irq map */ - for(i = 0; i < NR_VECTORS; i++) { - int cpu; - - for_each_possible_cpu(cpu) - per_cpu(vector_irq, cpu)[i] = i; - } -#endif /* CONFIG_X86_64 */ - - xen_init_IRQ(); -} - -static const struct pv_irq_ops xen_irq_ops __initdata = { - .init_IRQ = __xen_init_IRQ, - .save_fl = xen_save_fl, - .restore_fl = xen_restore_fl, - .irq_disable = xen_irq_disable, - .irq_enable = xen_irq_enable, - .safe_halt = xen_safe_halt, - .halt = xen_halt, -#ifdef CONFIG_X86_64 - .adjust_exception_frame = xen_adjust_exception_frame, -#endif -}; - static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC .apic_write = xen_apic_write, @@ -1740,10 +1621,11 @@ asmlinkage void __init xen_start_kernel(void) pv_init_ops = xen_init_ops; pv_time_ops = xen_time_ops; pv_cpu_ops = xen_cpu_ops; - pv_irq_ops = xen_irq_ops; pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; + xen_init_irq_ops(); + if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c new file mode 100644 index 00000000000..28b85ab8422 --- /dev/null +++ b/arch/x86/xen/irq.c @@ -0,0 +1,143 @@ +#include + +#include +#include +#include + +#include +#include + +#include "xen-ops.h" + +/* + * Force a proper event-channel callback from Xen after clearing the + * callback mask. We do this in a very simple manner, by making a call + * down into Xen. The pending flag will be checked by Xen on return. 
+ */ +void xen_force_evtchn_callback(void) +{ + (void)HYPERVISOR_xen_version(0, NULL); +} + +static void __init __xen_init_IRQ(void) +{ +#ifdef CONFIG_X86_64 + int i; + + /* Create identity vector->irq map */ + for(i = 0; i < NR_VECTORS; i++) { + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(vector_irq, cpu)[i] = i; + } +#endif /* CONFIG_X86_64 */ + + xen_init_IRQ(); +} + +static unsigned long xen_save_fl(void) +{ + struct vcpu_info *vcpu; + unsigned long flags; + + vcpu = x86_read_percpu(xen_vcpu); + + /* flag has opposite sense of mask */ + flags = !vcpu->evtchn_upcall_mask; + + /* convert to IF type flag + -0 -> 0x00000000 + -1 -> 0xffffffff + */ + return (-flags) & X86_EFLAGS_IF; +} + +static void xen_restore_fl(unsigned long flags) +{ + struct vcpu_info *vcpu; + + /* convert from IF type flag */ + flags = !(flags & X86_EFLAGS_IF); + + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = flags; + preempt_enable_no_resched(); + + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + + if (flags == 0) { + preempt_check_resched(); + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + xen_force_evtchn_callback(); + } +} + +static void xen_irq_disable(void) +{ + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; + preempt_enable_no_resched(); +} + +static void xen_irq_enable(void) +{ + struct vcpu_info *vcpu; + + /* We don't need to worry about being preempted here, since + either a) interrupts are disabled, so no preemption, or b) + the caller is confused and is trying to re-enable interrupts + on an indeterminate processor. */ + + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = 0; + + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + xen_force_evtchn_callback(); +} + +static void xen_safe_halt(void) +{ + /* Blocking includes an implicit local_irq_enable(). 
*/ + if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) + BUG(); +} + +static void xen_halt(void) +{ + if (irqs_disabled()) + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + else + xen_safe_halt(); +} + +static const struct pv_irq_ops xen_irq_ops __initdata = { + .init_IRQ = __xen_init_IRQ, + .save_fl = xen_save_fl, + .restore_fl = xen_restore_fl, + .irq_disable = xen_irq_disable, + .irq_enable = xen_irq_enable, + .safe_halt = xen_safe_halt, + .halt = xen_halt, +#ifdef CONFIG_X86_64 + .adjust_exception_frame = xen_adjust_exception_frame, +#endif +}; + +void __init xen_init_irq_ops() +{ + pv_irq_ops = xen_irq_ops; +} diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 2497a30f41d..42786f59d9c 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -298,7 +298,7 @@ check_events: push %eax push %ecx push %edx - call force_evtchn_callback + call xen_force_evtchn_callback pop %edx pop %ecx pop %eax diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 7f58304fafb..3b9bda46487 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -122,7 +122,7 @@ check_events: push %r9 push %r10 push %r11 - call force_evtchn_callback + call xen_force_evtchn_callback pop %r11 pop %r10 pop %r9 diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 8847fb34f17..3c70ebc50b1 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -31,6 +31,7 @@ void xen_vcpu_restore(void); void __init xen_build_dynamic_phys_to_machine(void); +void xen_init_irq_ops(void); void xen_setup_timer(int cpu); void xen_setup_cpu_clockevents(void); unsigned long xen_tsc_khz(void); -- cgit v1.2.3 From cef43bf6b3afd819f7cdcba356af0e8220fb3789 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 28 Jul 2008 13:33:44 -0700 Subject: xen: fix allocation and use of large ldts, cleanup Add a proper comment for set_aliased_prot() and fix an unsigned long/void * warning. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index cf8b3a93122..04ec69e4d02 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -237,8 +237,10 @@ static unsigned long xen_store_tr(void) } /* - * If 'v' is a vmalloc mapping, then find the linear mapping of the - * page (if any) and also set its protections to match: + * Set the page permissions for a particular virtual address. If the + * address is a vmalloc mapping (or other non-linear mapping), then + * find the linear mapping of the page and also set its protections to + * match. */ static void set_aliased_prot(void *v, pgprot_t prot) { @@ -387,8 +389,7 @@ static void xen_load_gs_index(unsigned int idx) static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, const void *ptr) { - unsigned long lp = (unsigned long)&dt[entrynum]; - xmaddr_t mach_lp = arbitrary_virt_to_machine(lp); + xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]); u64 entry = *(u64 *)ptr; preempt_disable(); -- cgit v1.2.3 From 169ad16bb87c10a3f7c108bb7008ebc0270f617a Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Mon, 28 Jul 2008 18:32:09 -0300 Subject: xen_alloc_ptpage: cast PFN_PHYS() argument to unsigned long Currently paravirt_ops alloc_p*() uses u32 for the pfn args. 
We should change that later, but while the pfn parameter is still u32, we need to cast the PFN_PHYS() argument at xen_alloc_ptpage() to unsigned long, otherwise it will lose bits on the shift. I think PFN_PHYS() should behave better when fed with smaller integers, but a cast to unsigned long won't be enough for all cases on 32-bit PAE, and a cast to u64 would be overkill for most users of PFN_PHYS(). We could have two different flavors of PFN_PHYS: one for low pages only (unsigned long) and another that works for any page (u64)), but while we don't have it, we will need the cast to unsigned long on xen_alloc_ptpage(). Signed-off-by: Eduardo Habkost Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 04ec69e4d02..53afa14eb31 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -822,7 +822,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) SetPagePinned(page); if (!PageHighMem(page)) { - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); + make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); if (level == PT_PTE) pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); } else -- cgit v1.2.3 From 6e833587e11ed0dbf12e647127f2650e2f80b26d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 19 Aug 2008 13:16:17 -0700 Subject: xen: clean up domain mode predicates There are four operating modes Xen code may find itself running in: - native - hvm domain - pv dom0 - pv domU Clean up predicates for testing for these states to make them more consistent. Signed-off-by: Jeremy Fitzhardinge Cc: Xen-devel Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 53afa14eb31..b106e825d26 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -56,6 +56,9 @@ EXPORT_SYMBOL_GPL(hypercall_page); DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); +enum xen_domain_type xen_domain_type = XEN_NATIVE; +EXPORT_SYMBOL_GPL(xen_domain_type); + /* * Identity map, in addition to plain kernel map. This needs to be * large enough to allocate page table pages to allocate the rest. @@ -1613,6 +1616,8 @@ asmlinkage void __init xen_start_kernel(void) if (!xen_start_info) return; + xen_domain_type = XEN_PV_DOMAIN; + BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); xen_setup_features(); @@ -1650,7 +1655,7 @@ asmlinkage void __init xen_start_kernel(void) /* Prevent unwanted bits from being set in PTEs. 
*/ __supported_pte_mask &= ~_PAGE_GLOBAL; - if (!is_initial_xendomain()) + if (!xen_initial_domain()) __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); /* Don't do the full vcpu_info placement stuff until we have a @@ -1685,7 +1690,7 @@ asmlinkage void __init xen_start_kernel(void) boot_params.hdr.ramdisk_size = xen_start_info->mod_len; boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); - if (!is_initial_xendomain()) { + if (!xen_initial_domain()) { add_preferred_console("xenboot", 0, NULL); add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); -- cgit v1.2.3 From 11ad93e59d114f4b218873f1c93261be725d2e22 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 19 Aug 2008 13:32:51 -0700 Subject: xen: clarify locking used when pinning a pagetable. Add some comments explaining the locking and pinning algorithm when using split pte locks. Also implement a minor optimisation of not pinning the PTE when not using split pte locks. Signed-off-by: Jeremy Fitzhardinge Cc: Xen-devel Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index aa37469da69..d3752b6ce6e 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -590,8 +590,6 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), pmdidx_limit = 0; #endif - flush |= (*func)(virt_to_page(pgd), PT_PGD); - for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { pud_t *pud; @@ -637,7 +635,11 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), } } } + out: + /* Do the top level last, so that the callbacks can use it as + a cue to do final things like tlb flushes. */ + flush |= (*func)(virt_to_page(pgd), PT_PGD); return flush; } @@ -691,6 +693,26 @@ static int pin_page(struct page *page, enum pt_level level) flush = 0; + /* + * We need to hold the pagetable lock between the time + * we make the pagetable RO and when we actually pin + * it. If we don't, then other users may come in and + * attempt to update the pagetable by writing it, + * which will fail because the memory is RO but not + * pinned, so Xen won't do the trap'n'emulate. + * + * If we're using split pte locks, we can't hold the + * entire pagetable's worth of locks during the + * traverse, because we may wrap the preempt count (8 + * bits). The solution is to mark RO and pin each PTE + * page while holding the lock. This means the number + * of locks we end up holding is never more than a + * batch size (~32 entries, at present). + * + * If we're not using split pte locks, we needn't pin + * the PTE pages independently, because we're + * protected by the overall pagetable lock. + */ ptl = NULL; if (level == PT_PTE) ptl = lock_pte(page); @@ -699,10 +721,9 @@ static int pin_page(struct page *page, enum pt_level level) pfn_pte(pfn, PAGE_KERNEL_RO), level == PT_PGD ? UVMF_TLB_FLUSH : 0); - if (level == PT_PTE) + if (ptl) { xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); - if (ptl) { /* Queue a deferred unlock for when this batch is completed. */ xen_mc_callback(do_unlock, ptl); @@ -796,10 +817,18 @@ static int unpin_page(struct page *page, enum pt_level level) spinlock_t *ptl = NULL; struct multicall_space mcs; + /* + * Do the converse to pin_page. 
If we're using split + * pte locks, we must be holding the lock for while + * the pte page is unpinned but still RO to prevent + * concurrent updates from seeing it in this + * partially-pinned state. + */ if (level == PT_PTE) { ptl = lock_pte(page); - xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); + if (ptl) + xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); } mcs = __xen_mc_entry(0); @@ -837,7 +866,7 @@ static void xen_pgd_unpin(pgd_t *pgd) #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is unpinned */ - pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); #endif pgd_walk(pgd, unpin_page, USER_LIMIT); -- cgit v1.2.3 From 7708ad64a24a674f7905aa7a5099a50f055debec Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 19 Aug 2008 13:34:22 -0700 Subject: xen: add xen_ prefixes to make tracing with ftrace easier It's easier to pattern match on Xen function if they all start with xen_. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 66 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 32 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index d3752b6ce6e..d9a35a36309 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -229,14 +229,14 @@ void make_lowmem_page_readwrite(void *vaddr) } -static bool page_pinned(void *ptr) +static bool xen_page_pinned(void *ptr) { struct page *page = virt_to_page(ptr); return PagePinned(page); } -static void extend_mmu_update(const struct mmu_update *update) +static void xen_extend_mmu_update(const struct mmu_update *update) { struct multicall_space mcs; struct mmu_update *u; @@ -265,7 +265,7 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) /* ptr may be ioremapped for 64-bit pagetable setup */ u.ptr = arbitrary_virt_to_machine(ptr).maddr; u.val = pmd_val_ma(val); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -276,7 +276,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val) { /* If page is not pinned, we can just update the entry directly */ - if (!page_pinned(ptr)) { + if (!xen_page_pinned(ptr)) { *ptr = val; return; } @@ -334,7 +334,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; u.val = pte_val_ma(pte); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); } @@ -400,7 +400,7 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) /* ptr may be ioremapped for 64-bit pagetable setup */ u.ptr = arbitrary_virt_to_machine(ptr).maddr; u.val = pud_val_ma(val); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -411,7 +411,7 @@ void xen_set_pud(pud_t *ptr, pud_t val) { /* If page is not pinned, we can just update the entry directly */ - if (!page_pinned(ptr)) { + if (!xen_page_pinned(ptr)) { *ptr = val; return; } @@ -490,7 +490,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) u.ptr = virt_to_machine(ptr).maddr; u.val = pgd_val_ma(val); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); } /* @@ -519,10 +519,10 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) /* If page is not pinned, we can just update the entry directly */ - if (!page_pinned(ptr)) { + if (!xen_page_pinned(ptr)) { *ptr = val; if (user_ptr) { - WARN_ON(page_pinned(user_ptr)); + WARN_ON(xen_page_pinned(user_ptr)); *user_ptr = val; } return; @@ -555,8 +555,8 @@ void xen_set_pgd(pgd_t *ptr, 
pgd_t val) * For 64-bit, we must skip the Xen hole in the middle of the address * space, just after the big x86-64 virtual hole. */ -static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), - unsigned long limit) +static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), + unsigned long limit) { int flush = 0; unsigned hole_low, hole_high; @@ -644,7 +644,9 @@ out: return flush; } -static spinlock_t *lock_pte(struct page *page) +/* If we're using split pte locks, then take the page's lock and + return a pointer to it. Otherwise return NULL. */ +static spinlock_t *xen_pte_lock(struct page *page) { spinlock_t *ptl = NULL; @@ -656,7 +658,7 @@ static spinlock_t *lock_pte(struct page *page) return ptl; } -static void do_unlock(void *v) +static void xen_pte_unlock(void *v) { spinlock_t *ptl = v; spin_unlock(ptl); @@ -674,7 +676,7 @@ static void xen_do_pin(unsigned level, unsigned long pfn) MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); } -static int pin_page(struct page *page, enum pt_level level) +static int xen_pin_page(struct page *page, enum pt_level level) { unsigned pgfl = TestSetPagePinned(page); int flush; @@ -715,7 +717,7 @@ static int pin_page(struct page *page, enum pt_level level) */ ptl = NULL; if (level == PT_PTE) - ptl = lock_pte(page); + ptl = xen_pte_lock(page); MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, pfn_pte(pfn, PAGE_KERNEL_RO), @@ -726,7 +728,7 @@ static int pin_page(struct page *page, enum pt_level level) /* Queue a deferred unlock for when this batch is completed. */ - xen_mc_callback(do_unlock, ptl); + xen_mc_callback(xen_pte_unlock, ptl); } } @@ -740,7 +742,7 @@ void xen_pgd_pin(pgd_t *pgd) { xen_mc_batch(); - if (pgd_walk(pgd, pin_page, USER_LIMIT)) { + if (xen_pgd_walk(pgd, xen_pin_page, USER_LIMIT)) { /* re-enable interrupts for kmap_flush_unused */ xen_mc_issue(0); kmap_flush_unused(); @@ -754,14 +756,14 @@ void xen_pgd_pin(pgd_t *pgd) xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); if (user_pgd) { - pin_page(virt_to_page(user_pgd), PT_PGD); + xen_pin_page(virt_to_page(user_pgd), PT_PGD); xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); } } #else /* CONFIG_X86_32 */ #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is pinnable */ - pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + xen_pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); #endif xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); #endif /* CONFIG_X86_64 */ @@ -796,7 +798,7 @@ void xen_mm_pin_all(void) * that's before we have page structures to store the bits. So do all * the book-keeping now. */ -static __init int mark_pinned(struct page *page, enum pt_level level) +static __init int xen_mark_pinned(struct page *page, enum pt_level level) { SetPagePinned(page); return 0; @@ -804,10 +806,10 @@ static __init int mark_pinned(struct page *page, enum pt_level level) void __init xen_mark_init_mm_pinned(void) { - pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); + xen_pgd_walk(init_mm.pgd, xen_mark_pinned, FIXADDR_TOP); } -static int unpin_page(struct page *page, enum pt_level level) +static int xen_unpin_page(struct page *page, enum pt_level level) { unsigned pgfl = TestClearPagePinned(page); @@ -825,7 +827,7 @@ static int unpin_page(struct page *page, enum pt_level level) * partially-pinned state. 
*/ if (level == PT_PTE) { - ptl = lock_pte(page); + ptl = xen_pte_lock(page); if (ptl) xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); @@ -839,7 +841,7 @@ static int unpin_page(struct page *page, enum pt_level level) if (ptl) { /* unlock when batch completed */ - xen_mc_callback(do_unlock, ptl); + xen_mc_callback(xen_pte_unlock, ptl); } } @@ -859,17 +861,17 @@ static void xen_pgd_unpin(pgd_t *pgd) if (user_pgd) { xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); - unpin_page(virt_to_page(user_pgd), PT_PGD); + xen_unpin_page(virt_to_page(user_pgd), PT_PGD); } } #endif #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is unpinned */ - unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + xen_unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); #endif - pgd_walk(pgd, unpin_page, USER_LIMIT); + xen_pgd_walk(pgd, xen_unpin_page, USER_LIMIT); xen_mc_issue(0); } @@ -936,7 +938,7 @@ static void drop_other_mm_ref(void *info) } } -static void drop_mm_ref(struct mm_struct *mm) +static void xen_drop_mm_ref(struct mm_struct *mm) { cpumask_t mask; unsigned cpu; @@ -966,7 +968,7 @@ static void drop_mm_ref(struct mm_struct *mm) smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); } #else -static void drop_mm_ref(struct mm_struct *mm) +static void xen_drop_mm_ref(struct mm_struct *mm) { if (current->active_mm == mm) load_cr3(swapper_pg_dir); @@ -990,13 +992,13 @@ static void drop_mm_ref(struct mm_struct *mm) void xen_exit_mmap(struct mm_struct *mm) { get_cpu(); /* make sure we don't move around */ - drop_mm_ref(mm); + xen_drop_mm_ref(mm); put_cpu(); spin_lock(&mm->page_table_lock); /* pgd may not be pinned in the error exit path of execve */ - if (page_pinned(mm->pgd)) + if (xen_page_pinned(mm->pgd)) xen_pgd_unpin(mm->pgd); spin_unlock(&mm->page_table_lock); -- cgit v1.2.3 From 168d2f464ab9860f0d1e66cf1f9684973222f1c6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 20 Aug 2008 17:02:18 -0700 Subject: xen: save previous spinlock when blocking A spinlock can be interrupted while spinning, so make sure we preserve the previous lock of interest if we're taking a lock from within an interrupt handler. We also need to deal with the case where the blocking path gets interrupted between testing to see if the lock is free and actually blocking. If we get interrupted there and end up in the state where the lock is free but the irq isn't pending, then we'll block indefinitely in the hypervisor. This fix is to make sure that any nested lock-takers will always leave the irq pending if there's any chance the outer lock became free. Signed-off-by: Jeremy Fitzhardinge Acked-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/xen/spinlock.c | 65 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 15 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 8dc4d31da67..4884bc603aa 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -47,25 +47,41 @@ static int xen_spin_trylock(struct raw_spinlock *lock) static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); -static inline void spinning_lock(struct xen_spinlock *xl) +/* + * Mark a cpu as interested in a lock. Returns the CPU's previous + * lock of interest, in case we got preempted by an interrupt. 
+ */ +static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) { + struct xen_spinlock *prev; + + prev = __get_cpu_var(lock_spinners); __get_cpu_var(lock_spinners) = xl; + wmb(); /* set lock of interest before count */ + asm(LOCK_PREFIX " incw %0" : "+m" (xl->spinners) : : "memory"); + + return prev; } -static inline void unspinning_lock(struct xen_spinlock *xl) +/* + * Mark a cpu as no longer interested in a lock. Restores previous + * lock of interest (NULL for none). + */ +static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev) { asm(LOCK_PREFIX " decw %0" : "+m" (xl->spinners) : : "memory"); - wmb(); /* decrement count before clearing lock */ - __get_cpu_var(lock_spinners) = NULL; + wmb(); /* decrement count before restoring lock */ + __get_cpu_var(lock_spinners) = prev; } static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; + struct xen_spinlock *prev; int irq = __get_cpu_var(lock_kicker_irq); int ret; @@ -74,23 +90,42 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) return 0; /* announce we're spinning */ - spinning_lock(xl); + prev = spinning_lock(xl); - /* clear pending */ - xen_clear_irq_pending(irq); + do { + /* clear pending */ + xen_clear_irq_pending(irq); + + /* check again make sure it didn't become free while + we weren't looking */ + ret = xen_spin_trylock(lock); + if (ret) { + /* + * If we interrupted another spinlock while it + * was blocking, make sure it doesn't block + * without rechecking the lock. + */ + if (prev != NULL) + xen_set_irq_pending(irq); + goto out; + } - /* check again make sure it didn't become free while - we weren't looking */ - ret = xen_spin_trylock(lock); - if (ret) - goto out; + /* + * Block until irq becomes pending. If we're + * interrupted at this point (after the trylock but + * before entering the block), then the nested lock + * handler guarantees that the irq will be left + * pending if there's any chance the lock became free; + * xen_poll_irq() returns immediately if the irq is + * pending. + */ + xen_poll_irq(irq); + } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ - /* block until irq becomes pending */ - xen_poll_irq(irq); kstat_this_cpu.irqs[irq]++; out: - unspinning_lock(xl); + unspinning_lock(xl, prev); return ret; } -- cgit v1.2.3 From 994025caba3e6beade9bde84dd1b70d9d250f27b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 20 Aug 2008 17:02:19 -0700 Subject: xen: add debugfs support Add support for exporting statistics on mmu updates, multicall batching and pv spinlocks into debugfs. The base path is xen/ and each subsystem adds its own directory: mmu, multicalls, spinlocks. In each directory, writing 1 to "zero_stats" will cause the corresponding stats to be zeroed the next time they're updated. 
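In practice, with debugfs mounted in the usual place, the counters appear under /sys/kernel/debug/xen/mmu/, /sys/kernel/debug/xen/multicalls/ and /sys/kernel/debug/xen/spinlocks/. Note the deliberately lazy reset: writing 1 to zero_stats only sets a flag, and the actual memset happens from the next stats update on the hot path (see check_zero() in the diffs below), presumably so that readers and the updaters of the counters never need any locking between them.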
Signed-off-by: Jeremy Fitzhardinge Acked-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/xen/Kconfig | 10 ++- arch/x86/xen/Makefile | 3 +- arch/x86/xen/debugfs.c | 123 ++++++++++++++++++++++++++++++++++ arch/x86/xen/debugfs.h | 10 +++ arch/x86/xen/mmu.c | 163 ++++++++++++++++++++++++++++++++++++++++++++- arch/x86/xen/multicalls.c | 115 +++++++++++++++++++++++++++++++- arch/x86/xen/spinlock.c | 165 +++++++++++++++++++++++++++++++++++++++++++++- 7 files changed, 580 insertions(+), 9 deletions(-) create mode 100644 arch/x86/xen/debugfs.c create mode 100644 arch/x86/xen/debugfs.h (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 3815e425f47..d3e68465ace 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -27,4 +27,12 @@ config XEN_MAX_DOMAIN_MEMORY config XEN_SAVE_RESTORE bool depends on PM - default y \ No newline at end of file + default y + +config XEN_DEBUG_FS + bool "Enable Xen debug and tuning parameters in debugfs" + depends on XEN && DEBUG_FS + default n + help + Enable statistics output and various tuning options in debugfs. + Enabling this option may incur a significant performance overhead. \ No newline at end of file diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 9ee745fa552..313947940a1 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -8,4 +8,5 @@ endif obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm_$(BITS).o grant-table.o suspend.o -obj-$(CONFIG_SMP) += smp.o spinlock.o +obj-$(CONFIG_SMP) += smp.o spinlock.o +obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c new file mode 100644 index 00000000000..b53225d2cac --- /dev/null +++ b/arch/x86/xen/debugfs.c @@ -0,0 +1,123 @@ +#include +#include +#include + +#include "debugfs.h" + +static struct dentry *d_xen_debug; + +struct dentry * __init xen_init_debugfs(void) +{ + if (!d_xen_debug) { + d_xen_debug = debugfs_create_dir("xen", NULL); + + if (!d_xen_debug) + pr_warning("Could not create 'xen' debugfs directory\n"); + } + + return d_xen_debug; +} + +struct array_data +{ + void *array; + unsigned elements; +}; + +static int u32_array_open(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + return nonseekable_open(inode, file); +} + +static size_t format_array(char *buf, size_t bufsize, const char *fmt, + u32 *array, unsigned array_size) +{ + size_t ret = 0; + unsigned i; + + for(i = 0; i < array_size; i++) { + size_t len; + + len = snprintf(buf, bufsize, fmt, array[i]); + len++; /* ' ' or '\n' */ + ret += len; + + if (buf) { + buf += len; + bufsize -= len; + buf[-1] = (i == array_size-1) ? 
'\n' : ' '; + } + } + + ret++; /* \0 */ + if (buf) + *buf = '\0'; + + return ret; +} + +static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size) +{ + size_t len = format_array(NULL, 0, fmt, array, array_size); + char *ret; + + ret = kmalloc(len, GFP_KERNEL); + if (ret == NULL) + return NULL; + + format_array(ret, len, fmt, array, array_size); + return ret; +} + +static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, + loff_t *ppos) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct array_data *data = inode->i_private; + size_t size; + + if (*ppos == 0) { + if (file->private_data) { + kfree(file->private_data); + file->private_data = NULL; + } + + file->private_data = format_array_alloc("%u", data->array, data->elements); + } + + size = 0; + if (file->private_data) + size = strlen(file->private_data); + + return simple_read_from_buffer(buf, len, ppos, file->private_data, size); +} + +static int xen_array_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + + return 0; +} + +static struct file_operations u32_array_fops = { + .owner = THIS_MODULE, + .open = u32_array_open, + .release= xen_array_release, + .read = u32_array_read, +}; + +struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, + struct dentry *parent, + u32 *array, unsigned elements) +{ + struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL); + + if (data == NULL) + return NULL; + + data->array = array; + data->elements = elements; + + return debugfs_create_file(name, mode, parent, data, &u32_array_fops); +} diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h new file mode 100644 index 00000000000..e2813208483 --- /dev/null +++ b/arch/x86/xen/debugfs.h @@ -0,0 +1,10 @@ +#ifndef _XEN_DEBUGFS_H +#define _XEN_DEBUGFS_H + +struct dentry * __init xen_init_debugfs(void); + +struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, + struct dentry *parent, + u32 *array, unsigned elements); + +#endif /* _XEN_DEBUGFS_H */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index d9a35a36309..f5af913fd7b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -40,6 +40,7 @@ */ #include #include +#include #include #include @@ -57,6 +58,61 @@ #include "multicalls.h" #include "mmu.h" +#include "debugfs.h" + +#define MMU_UPDATE_HISTO 30 + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct { + u32 pgd_update; + u32 pgd_update_pinned; + u32 pgd_update_batched; + + u32 pud_update; + u32 pud_update_pinned; + u32 pud_update_batched; + + u32 pmd_update; + u32 pmd_update_pinned; + u32 pmd_update_batched; + + u32 pte_update; + u32 pte_update_pinned; + u32 pte_update_batched; + + u32 mmu_update; + u32 mmu_update_extended; + u32 mmu_update_histo[MMU_UPDATE_HISTO]; + + u32 prot_commit; + u32 prot_commit_batched; + + u32 set_pte_at; + u32 set_pte_at_batched; + u32 set_pte_at_pinned; + u32 set_pte_at_current; + u32 set_pte_at_kernel; +} mmu_stats; + +static u8 zero_stats; + +static inline void check_zero(void) +{ + if (unlikely(zero_stats)) { + memset(&mmu_stats, 0, sizeof(mmu_stats)); + zero_stats = 0; + } +} + +#define ADD_STATS(elem, val) \ + do { check_zero(); mmu_stats.elem += (val); } while(0) + +#else /* !CONFIG_XEN_DEBUG_FS */ + +#define ADD_STATS(elem, val) do { (void)(val); } while(0) + +#endif /* CONFIG_XEN_DEBUG_FS */ /* * Just beyond the highest usermode address. 
STACK_TOP_MAX has a @@ -243,11 +299,21 @@ static void xen_extend_mmu_update(const struct mmu_update *update) mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); - if (mcs.mc != NULL) + if (mcs.mc != NULL) { + ADD_STATS(mmu_update_extended, 1); + ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1); + mcs.mc->args[1]++; - else { + + if (mcs.mc->args[1] < MMU_UPDATE_HISTO) + ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1); + else + ADD_STATS(mmu_update_histo[0], 1); + } else { + ADD_STATS(mmu_update, 1); mcs = __xen_mc_entry(sizeof(*u)); MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); + ADD_STATS(mmu_update_histo[1], 1); } u = mcs.args; @@ -267,6 +333,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) u.val = pmd_val_ma(val); xen_extend_mmu_update(&u); + ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + xen_mc_issue(PARAVIRT_LAZY_MMU); preempt_enable(); @@ -274,6 +342,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) void xen_set_pmd(pmd_t *ptr, pmd_t val) { + ADD_STATS(pmd_update, 1); + /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -281,6 +351,8 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val) return; } + ADD_STATS(pmd_update_pinned, 1); + xen_set_pmd_hyper(ptr, val); } @@ -300,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, if (mm == &init_mm) preempt_disable(); + ADD_STATS(set_pte_at, 1); +// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); + ADD_STATS(set_pte_at_current, mm == current->mm); + ADD_STATS(set_pte_at_kernel, mm == &init_mm); + if (mm == current->mm || mm == &init_mm) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { struct multicall_space mcs; mcs = xen_mc_entry(0); MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); + ADD_STATS(set_pte_at_batched, 1); xen_mc_issue(PARAVIRT_LAZY_MMU); goto out; } else @@ -336,6 +414,9 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, u.val = pte_val_ma(pte); xen_extend_mmu_update(&u); + ADD_STATS(prot_commit, 1); + ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + xen_mc_issue(PARAVIRT_LAZY_MMU); } @@ -402,6 +483,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) u.val = pud_val_ma(val); xen_extend_mmu_update(&u); + ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + xen_mc_issue(PARAVIRT_LAZY_MMU); preempt_enable(); @@ -409,6 +492,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) void xen_set_pud(pud_t *ptr, pud_t val) { + ADD_STATS(pud_update, 1); + /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -416,11 +501,17 @@ void xen_set_pud(pud_t *ptr, pud_t val) return; } + ADD_STATS(pud_update_pinned, 1); + xen_set_pud_hyper(ptr, val); } void xen_set_pte(pte_t *ptep, pte_t pte) { + ADD_STATS(pte_update, 1); +// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); + ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + #ifdef CONFIG_X86_PAE ptep->pte_high = pte.pte_high; smp_wmb(); @@ -517,6 +608,8 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) { pgd_t *user_ptr = xen_get_user_pgd(ptr); + ADD_STATS(pgd_update, 1); + /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -528,6 +621,9 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) return; } + ADD_STATS(pgd_update_pinned, 1); + ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + /* If it's pinned, then we can at least batch the kernel and user 
updates together. */ xen_mc_batch(); @@ -1003,3 +1099,66 @@ void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct dentry *d_mmu_debug; + +static int __init xen_mmu_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + d_mmu_debug = debugfs_create_dir("mmu", d_xen); + + debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats); + + debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update); + debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug, + &mmu_stats.pgd_update_pinned); + debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug, + &mmu_stats.pgd_update_pinned); + + debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update); + debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug, + &mmu_stats.pud_update_pinned); + debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug, + &mmu_stats.pud_update_pinned); + + debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update); + debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug, + &mmu_stats.pmd_update_pinned); + debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug, + &mmu_stats.pmd_update_pinned); + + debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update); +// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug, +// &mmu_stats.pte_update_pinned); + debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug, + &mmu_stats.pte_update_pinned); + + debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update); + debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug, + &mmu_stats.mmu_update_extended); + xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug, + mmu_stats.mmu_update_histo, 20); + + debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at); + debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug, + &mmu_stats.set_pte_at_batched); + debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug, + &mmu_stats.set_pte_at_current); + debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug, + &mmu_stats.set_pte_at_kernel); + + debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit); + debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug, + &mmu_stats.prot_commit_batched); + + return 0; +} +fs_initcall(xen_mmu_debugfs); + +#endif /* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 9efd1c6c977..8ea8a0d0b0d 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -21,16 +21,20 @@ */ #include #include +#include #include #include "multicalls.h" +#include "debugfs.h" + +#define MC_BATCH 32 #define MC_DEBUG 1 -#define MC_BATCH 32 #define MC_ARGS (MC_BATCH * 16) + struct mc_buffer { struct multicall_entry entries[MC_BATCH]; #if MC_DEBUG @@ -47,6 +51,76 @@ struct mc_buffer { static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); +/* flush reasons 0- slots, 1- args, 2- callbacks */ +enum flush_reasons +{ + FL_SLOTS, + FL_ARGS, + FL_CALLBACKS, + + FL_N_REASONS +}; + +#ifdef CONFIG_XEN_DEBUG_FS +#define NHYPERCALLS 40 /* not really */ + +static struct { + unsigned histo[MC_BATCH+1]; + + unsigned issued; + unsigned arg_total; + unsigned hypercalls; + unsigned histo_hypercalls[NHYPERCALLS]; + + unsigned flush[FL_N_REASONS]; +} mc_stats; + +static u8 zero_stats; + +static inline void check_zero(void) +{ + if (unlikely(zero_stats)) { + 
memset(&mc_stats, 0, sizeof(mc_stats)); + zero_stats = 0; + } +} + +static void mc_add_stats(const struct mc_buffer *mc) +{ + int i; + + check_zero(); + + mc_stats.issued++; + mc_stats.hypercalls += mc->mcidx; + mc_stats.arg_total += mc->argidx; + + mc_stats.histo[mc->mcidx]++; + for(i = 0; i < mc->mcidx; i++) { + unsigned op = mc->entries[i].op; + if (op < NHYPERCALLS) + mc_stats.histo_hypercalls[op]++; + } +} + +static void mc_stats_flush(enum flush_reasons idx) +{ + check_zero(); + + mc_stats.flush[idx]++; +} + +#else /* !CONFIG_XEN_DEBUG_FS */ + +static inline void mc_add_stats(const struct mc_buffer *mc) +{ +} + +static inline void mc_stats_flush(enum flush_reasons idx) +{ +} +#endif /* CONFIG_XEN_DEBUG_FS */ + void xen_mc_flush(void) { struct mc_buffer *b = &__get_cpu_var(mc_buffer); @@ -60,6 +134,8 @@ void xen_mc_flush(void) something in the middle */ local_irq_save(flags); + mc_add_stats(b); + if (b->mcidx) { #if MC_DEBUG memcpy(b->debug, b->entries, @@ -115,6 +191,7 @@ struct multicall_space __xen_mc_entry(size_t args) if (b->mcidx == MC_BATCH || (argidx + args) > MC_ARGS) { + mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS); xen_mc_flush(); argidx = roundup(b->argidx, sizeof(u64)); } @@ -158,10 +235,44 @@ void xen_mc_callback(void (*fn)(void *), void *data) struct mc_buffer *b = &__get_cpu_var(mc_buffer); struct callback *cb; - if (b->cbidx == MC_BATCH) + if (b->cbidx == MC_BATCH) { + mc_stats_flush(FL_CALLBACKS); xen_mc_flush(); + } cb = &b->callbacks[b->cbidx++]; cb->fn = fn; cb->data = data; } + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct dentry *d_mc_debug; + +static int __init xen_mc_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + d_mc_debug = debugfs_create_dir("multicalls", d_xen); + + debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats); + + debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued); + debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls); + debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total); + + xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug, + mc_stats.histo, MC_BATCH); + xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug, + mc_stats.histo_hypercalls, NHYPERCALLS); + xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug, + mc_stats.flush, FL_N_REASONS); + + return 0; +} +fs_initcall(xen_mc_debugfs); + +#endif /* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 4884bc603aa..0d8f3b2d9be 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -4,6 +4,8 @@ */ #include #include +#include +#include #include @@ -11,6 +13,93 @@ #include #include "xen-ops.h" +#include "debugfs.h" + +#ifdef CONFIG_XEN_DEBUG_FS +static struct xen_spinlock_stats +{ + u64 taken; + u32 taken_slow; + u32 taken_slow_nested; + u32 taken_slow_pickup; + u32 taken_slow_spurious; + + u64 released; + u32 released_slow; + u32 released_slow_kicked; + +#define HISTO_BUCKETS 20 + u32 histo_spin_fast[HISTO_BUCKETS+1]; + u32 histo_spin[HISTO_BUCKETS+1]; + + u64 spinning_time; + u64 total_time; +} spinlock_stats; + +static u8 zero_stats; + +static unsigned lock_timeout = 1 << 10; +#define TIMEOUT lock_timeout + +static inline void check_zero(void) +{ + if (unlikely(zero_stats)) { + memset(&spinlock_stats, 0, sizeof(spinlock_stats)); + zero_stats = 0; + } +} + +#define ADD_STATS(elem, val) \ + do { check_zero(); spinlock_stats.elem += (val); } while(0) + +static inline u64 
spin_time_start(void) +{ + return xen_clocksource_read(); +} + +static void __spin_time_accum(u64 delta, u32 *array) +{ + unsigned index = ilog2(delta); + + check_zero(); + + if (index < HISTO_BUCKETS) + array[index]++; + else + array[HISTO_BUCKETS]++; +} + +static inline void spin_time_accum_fast(u64 start) +{ + u32 delta = xen_clocksource_read() - start; + + __spin_time_accum(delta, spinlock_stats.histo_spin_fast); + spinlock_stats.spinning_time += delta; +} + +static inline void spin_time_accum(u64 start) +{ + u32 delta = xen_clocksource_read() - start; + + __spin_time_accum(delta, spinlock_stats.histo_spin); + spinlock_stats.total_time += delta; +} +#else /* !CONFIG_XEN_DEBUG_FS */ +#define TIMEOUT (1 << 10) +#define ADD_STATS(elem, val) do { (void)(val); } while(0) + +static inline u64 spin_time_start(void) +{ + return 0; +} + +static inline void spin_time_accum_fast(u64 start) +{ +} +static inline void spin_time_accum(u64 start) +{ +} +#endif /* CONFIG_XEN_DEBUG_FS */ struct xen_spinlock { unsigned char lock; /* 0 -> free; 1 -> locked */ @@ -92,6 +181,9 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) /* announce we're spinning */ prev = spinning_lock(xl); + ADD_STATS(taken_slow, 1); + ADD_STATS(taken_slow_nested, prev != NULL); + do { /* clear pending */ xen_clear_irq_pending(irq); @@ -100,6 +192,8 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) we weren't looking */ ret = xen_spin_trylock(lock); if (ret) { + ADD_STATS(taken_slow_pickup, 1); + /* * If we interrupted another spinlock while it * was blocking, make sure it doesn't block @@ -120,6 +214,7 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) * pending. */ xen_poll_irq(irq); + ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ kstat_this_cpu.irqs[irq]++; @@ -132,11 +227,18 @@ out: static void xen_spin_lock(struct raw_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; - int timeout; + unsigned timeout; u8 oldval; + u64 start_spin; + + ADD_STATS(taken, 1); + + start_spin = spin_time_start(); do { - timeout = 1 << 10; + u64 start_spin_fast = spin_time_start(); + + timeout = TIMEOUT; asm("1: xchgb %1,%0\n" " testb %1,%1\n" @@ -151,16 +253,22 @@ static void xen_spin_lock(struct raw_spinlock *lock) : "1" (1) : "memory"); - } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock))); + spin_time_accum_fast(start_spin_fast); + } while (unlikely(oldval != 0 && (TIMEOUT == ~0 || !xen_spin_lock_slow(lock)))); + + spin_time_accum(start_spin); } static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) { int cpu; + ADD_STATS(released_slow, 1); + for_each_online_cpu(cpu) { /* XXX should mix up next cpu selection */ if (per_cpu(lock_spinners, cpu) == xl) { + ADD_STATS(released_slow_kicked, 1); xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); break; } @@ -171,6 +279,8 @@ static void xen_spin_unlock(struct raw_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; + ADD_STATS(released, 1); + smp_wmb(); /* make sure no writes get moved after unlock */ xl->lock = 0; /* release lock */ @@ -216,3 +326,52 @@ void __init xen_init_spinlocks(void) pv_lock_ops.spin_trylock = xen_spin_trylock; pv_lock_ops.spin_unlock = xen_spin_unlock; } + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct dentry *d_spin_debug; + +static int __init xen_spinlock_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + d_spin_debug = 
debugfs_create_dir("spinlocks", d_xen); + + debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); + + debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout); + + debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken); + debugfs_create_u32("taken_slow", 0444, d_spin_debug, + &spinlock_stats.taken_slow); + debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug, + &spinlock_stats.taken_slow_nested); + debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, + &spinlock_stats.taken_slow_pickup); + debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug, + &spinlock_stats.taken_slow_spurious); + + debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released); + debugfs_create_u32("released_slow", 0444, d_spin_debug, + &spinlock_stats.released_slow); + debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, + &spinlock_stats.released_slow_kicked); + + debugfs_create_u64("time_spinning", 0444, d_spin_debug, + &spinlock_stats.spinning_time); + debugfs_create_u64("time_total", 0444, d_spin_debug, + &spinlock_stats.total_time); + + xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug, + spinlock_stats.histo_spin, HISTO_BUCKETS + 1); + xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, + spinlock_stats.histo_spin_fast, HISTO_BUCKETS + 1); + + return 0; +} +fs_initcall(xen_spinlock_debugfs); + +#endif /* CONFIG_XEN_DEBUG_FS */ -- cgit v1.2.3 From 1e696f638b523958ee3d95e655becdb4c4b6fcde Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 20 Aug 2008 17:02:20 -0700 Subject: xen: allow interrupts to be enabled while doing a blocking spin If spin_lock is called in an interrupts-enabled context, we can safely enable interrupts while spinning. We don't bother for the actual spin loop, but if we timeout and fall back to blocking, it's definitely worthwhile enabling interrupts if possible. 
Signed-off-by: Jeremy Fitzhardinge Acked-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/xen/spinlock.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 0d8f3b2d9be..e6061f2e64f 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -23,6 +23,7 @@ static struct xen_spinlock_stats u32 taken_slow_nested; u32 taken_slow_pickup; u32 taken_slow_spurious; + u32 taken_slow_irqenable; u64 released; u32 released_slow; @@ -167,12 +168,13 @@ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock __get_cpu_var(lock_spinners) = prev; } -static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) +static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; struct xen_spinlock *prev; int irq = __get_cpu_var(lock_kicker_irq); int ret; + unsigned long flags; /* If kicker interrupts not initialized yet, just spin */ if (irq == -1) @@ -181,6 +183,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) /* announce we're spinning */ prev = spinning_lock(xl); + flags = __raw_local_save_flags(); + if (irq_enable) { + ADD_STATS(taken_slow_irqenable, 1); + raw_local_irq_enable(); + } + ADD_STATS(taken_slow, 1); ADD_STATS(taken_slow_nested, prev != NULL); @@ -220,11 +228,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) kstat_this_cpu.irqs[irq]++; out: + raw_local_irq_restore(flags); unspinning_lock(xl, prev); return ret; } -static void xen_spin_lock(struct raw_spinlock *lock) +static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; unsigned timeout; @@ -254,11 +263,23 @@ static void xen_spin_lock(struct raw_spinlock *lock) : "memory"); spin_time_accum_fast(start_spin_fast); - } while (unlikely(oldval != 0 && (TIMEOUT == ~0 || !xen_spin_lock_slow(lock)))); + + } while (unlikely(oldval != 0 && + (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable)))); spin_time_accum(start_spin); } +static void xen_spin_lock(struct raw_spinlock *lock) +{ + __xen_spin_lock(lock, false); +} + +static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags) +{ + __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags)); +} + static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) { int cpu; @@ -323,6 +344,7 @@ void __init xen_init_spinlocks(void) pv_lock_ops.spin_is_locked = xen_spin_is_locked; pv_lock_ops.spin_is_contended = xen_spin_is_contended; pv_lock_ops.spin_lock = xen_spin_lock; + pv_lock_ops.spin_lock_flags = xen_spin_lock_flags; pv_lock_ops.spin_trylock = xen_spin_trylock; pv_lock_ops.spin_unlock = xen_spin_unlock; } @@ -353,6 +375,8 @@ static int __init xen_spinlock_debugfs(void) &spinlock_stats.taken_slow_pickup); debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug, &spinlock_stats.taken_slow_spurious); + debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug, + &spinlock_stats.taken_slow_irqenable); debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released); debugfs_create_u32("released_slow", 0444, d_spin_debug, -- cgit v1.2.3 From f8eca41f0afcc7042dc3398c8ba5878c59b1cf1b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 20 Aug 2008 17:02:21 -0700 Subject: xen: measure how long spinlocks spend blocking Measure how long spinlocks spend blocked. 
Also rename some fields to be more consistent. Signed-off-by: Jeremy Fitzhardinge Acked-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/xen/spinlock.c | 62 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 20 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index e6061f2e64f..d072823bc06 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -29,12 +29,14 @@ static struct xen_spinlock_stats u32 released_slow; u32 released_slow_kicked; -#define HISTO_BUCKETS 20 - u32 histo_spin_fast[HISTO_BUCKETS+1]; - u32 histo_spin[HISTO_BUCKETS+1]; - - u64 spinning_time; - u64 total_time; +#define HISTO_BUCKETS 30 + u32 histo_spin_total[HISTO_BUCKETS+1]; + u32 histo_spin_spinning[HISTO_BUCKETS+1]; + u32 histo_spin_blocked[HISTO_BUCKETS+1]; + + u64 time_total; + u64 time_spinning; + u64 time_blocked; } spinlock_stats; static u8 zero_stats; @@ -70,20 +72,28 @@ static void __spin_time_accum(u64 delta, u32 *array) array[HISTO_BUCKETS]++; } -static inline void spin_time_accum_fast(u64 start) +static inline void spin_time_accum_spinning(u64 start) +{ + u32 delta = xen_clocksource_read() - start; + + __spin_time_accum(delta, spinlock_stats.histo_spin_spinning); + spinlock_stats.time_spinning += delta; +} + +static inline void spin_time_accum_total(u64 start) { u32 delta = xen_clocksource_read() - start; - __spin_time_accum(delta, spinlock_stats.histo_spin_fast); - spinlock_stats.spinning_time += delta; + __spin_time_accum(delta, spinlock_stats.histo_spin_total); + spinlock_stats.time_total += delta; } -static inline void spin_time_accum(u64 start) +static inline void spin_time_accum_blocked(u64 start) { u32 delta = xen_clocksource_read() - start; - __spin_time_accum(delta, spinlock_stats.histo_spin); - spinlock_stats.total_time += delta; + __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); + spinlock_stats.time_blocked += delta; } #else /* !CONFIG_XEN_DEBUG_FS */ #define TIMEOUT (1 << 10) @@ -94,10 +104,13 @@ static inline u64 spin_time_start(void) return 0; } -static inline void spin_time_accum_fast(u64 start) +static inline void spin_time_accum_total(u64 start) +{ +} +static inline void spin_time_accum_spinning(u64 start) { } -static inline void spin_time_accum(u64 start) +static inline void spin_time_accum_blocked(u64 start) { } #endif /* CONFIG_XEN_DEBUG_FS */ @@ -175,11 +188,14 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl int irq = __get_cpu_var(lock_kicker_irq); int ret; unsigned long flags; + u64 start; /* If kicker interrupts not initialized yet, just spin */ if (irq == -1) return 0; + start = spin_time_start(); + /* announce we're spinning */ prev = spinning_lock(xl); @@ -230,6 +246,8 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl out: raw_local_irq_restore(flags); unspinning_lock(xl, prev); + spin_time_accum_blocked(start); + return ret; } @@ -262,12 +280,12 @@ static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable) : "1" (1) : "memory"); - spin_time_accum_fast(start_spin_fast); + spin_time_accum_spinning(start_spin_fast); } while (unlikely(oldval != 0 && (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable)))); - spin_time_accum(start_spin); + spin_time_accum_total(start_spin); } static void xen_spin_lock(struct raw_spinlock *lock) @@ -385,14 +403,18 @@ static int __init xen_spinlock_debugfs(void) &spinlock_stats.released_slow_kicked); debugfs_create_u64("time_spinning", 0444, 
d_spin_debug, - &spinlock_stats.spinning_time); + &spinlock_stats.time_spinning); + debugfs_create_u64("time_blocked", 0444, d_spin_debug, + &spinlock_stats.time_blocked); debugfs_create_u64("time_total", 0444, d_spin_debug, - &spinlock_stats.total_time); + &spinlock_stats.time_total); xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug, - spinlock_stats.histo_spin, HISTO_BUCKETS + 1); + spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, - spinlock_stats.histo_spin_fast, HISTO_BUCKETS + 1); + spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); + xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, + spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); return 0; } -- cgit v1.2.3 From f86399396ce7a4f4069828b7dceac5aa5113dfb5 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Wed, 30 Jul 2008 18:32:27 -0300 Subject: x86, paravirt_ops: use unsigned long instead of u32 for alloc_p*() pfn args This patch changes the pfn args from 'u32' to 'unsigned long' on alloc_p*() functions on paravirt_ops, and the corresponding implementations for Xen and VMI. The prototypes for CONFIG_PARAVIRT=n are already using unsigned long, so paravirt.h now matches the prototypes on asm-x86/pgalloc.h. It shouldn't result in any changes on generated code on 32-bit, with or without CONFIG_PARAVIRT. On both cases, 'codiff -f' didn't show any change after applying this patch. On 64-bit, there are (expected) binary changes only when CONFIG_PARAVIRT is enabled, as the patch is really supposed to change the size of the pfn args. [ v2: KVM_GUEST: use the right parameter type on kvm_release_pt() ] Signed-off-by: Eduardo Habkost Acked-by: Jeremy Fitzhardinge Acked-by: Zachary Amsden Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9ff6e3cbf08..db970bdc5e3 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -812,7 +812,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ -static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) +static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) { #ifdef CONFIG_FLATMEM BUG_ON(mem_map); /* should only be used early */ @@ -822,7 +822,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) /* Early release_pte assumes that all pts are pinned, since there's only init_mm and anything attached to that is pinned. */ -static void xen_release_pte_init(u32 pfn) +static void xen_release_pte_init(unsigned long pfn) { make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } @@ -838,7 +838,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) /* This needs to make sure the new pte page is pinned iff its being attached to a pinned pagetable. 
*/ -static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) +static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) { struct page *page = pfn_to_page(pfn); @@ -856,12 +856,12 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) } } -static void xen_alloc_pte(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PTE); } -static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PMD); } @@ -909,7 +909,7 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) } /* This should never happen until we're OK to use struct page */ -static void xen_release_ptpage(u32 pfn, unsigned level) +static void xen_release_ptpage(unsigned long pfn, unsigned level) { struct page *page = pfn_to_page(pfn); @@ -923,23 +923,23 @@ static void xen_release_ptpage(u32 pfn, unsigned level) } } -static void xen_release_pte(u32 pfn) +static void xen_release_pte(unsigned long pfn) { xen_release_ptpage(pfn, PT_PTE); } -static void xen_release_pmd(u32 pfn) +static void xen_release_pmd(unsigned long pfn) { xen_release_ptpage(pfn, PT_PMD); } #if PAGETABLE_LEVELS == 4 -static void xen_alloc_pud(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PUD); } -static void xen_release_pud(u32 pfn) +static void xen_release_pud(unsigned long pfn) { xen_release_ptpage(pfn, PT_PUD); } -- cgit v1.2.3 From ee7686bc043921a488d9bf89fe93241c5457a74e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 21 Aug 2008 13:17:56 -0700 Subject: x86: build fix in "xen spinlock updates and performance measurements" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ingo Molnar wrote: > -tip testing found this build failure: > > arch/x86/xen/spinlock.c: In function ‘spin_time_start’: > arch/x86/xen/spinlock.c:60: error: implicit declaration of function ‘xen_clocksource_read’ > > i've excluded these new commits for now from tip/master - could you > please send a delta fix against tip/x86/xen? Make xen_clocksource_read non-static. 
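The fix is the standard cure for an implicit-declaration error across translation units: drop the 'static' so the symbol gets external linkage, and publish a prototype in a shared header. A generic two-file sketch of the same pattern, with invented names (clock.h/clock_read), for illustration only:

/* clock.h: every includer now sees a proper prototype, so callers in
 * other files (the spinlock code here) compile cleanly. */
#ifndef CLOCK_H
#define CLOCK_H
unsigned long long clock_read(void);
#endif

/* clock.c: defined without 'static', so the symbol has external
 * linkage and other translation units can link against it. */
#include "clock.h"
unsigned long long clock_read(void)
{
        return 0;       /* stand-in for the real time source */
}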
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/time.c | 4 +--- arch/x86/xen/xen-ops.h | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 685b77470fc..20182d9072c 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -30,8 +30,6 @@ #define TIMER_SLOP 100000 #define NS_PER_TICK (1000000000LL / HZ) -static cycle_t xen_clocksource_read(void); - /* runstate info updated by Xen */ static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); @@ -213,7 +211,7 @@ unsigned long xen_tsc_khz(void) return xen_khz; } -static cycle_t xen_clocksource_read(void) +cycle_t xen_clocksource_read(void) { struct pvclock_vcpu_time_info *src; cycle_t ret; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 3c70ebc50b1..1e8bfdaa20d 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -2,6 +2,7 @@ #define XEN_OPS_H #include +#include #include #include @@ -33,6 +34,7 @@ void __init xen_build_dynamic_phys_to_machine(void); void xen_init_irq_ops(void); void xen_setup_timer(int cpu); +cycle_t xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); unsigned long xen_tsc_khz(void); void __init xen_time_init(void); -- cgit v1.2.3 From d68d82afd4c88e25763b23cd9cd4974573a3706f Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Fri, 22 Aug 2008 11:52:15 +0100 Subject: xen: implement CPU hotplugging Note the changes from 2.6.18-xen CPU hotplugging: A vcpu_down request from the remote admin via Xenbus both hotunplugs the CPU, and disables it by removing it from the cpu_present map, and removing its entry in /sys. A vcpu_up request from the remote admin only re-enables the CPU, and does not immediately bring the CPU up. A udev event is emitted, which can be caught by the user if he wishes to automatically re-up CPUs when available, or implement a more complex policy. Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/smp.c | 60 +++++++++++++++++++++++++++++++++++++++---------- arch/x86/xen/spinlock.c | 5 +++++ arch/x86/xen/time.c | 8 +++++++ arch/x86/xen/xen-ops.h | 6 +++++ 4 files changed, 67 insertions(+), 12 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index baca7f2fbd8..be5cbb2b7c6 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -11,8 +11,6 @@ * useful topology information for the kernel to make use of. As a * result, all CPUs are treated as if they're single-core and * single-threaded. - * - * This does not handle HOTPLUG_CPU yet. 
*/ #include #include @@ -61,11 +59,12 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static __cpuinit void cpu_bringup_and_idle(void) +static __cpuinit void cpu_bringup(void) { int cpu = smp_processor_id(); cpu_init(); + touch_softlockup_watchdog(); preempt_disable(); xen_enable_sysenter(); @@ -86,6 +85,11 @@ static __cpuinit void cpu_bringup_and_idle(void) local_irq_enable(); wmb(); /* make sure everything is out */ +} + +static __cpuinit void cpu_bringup_and_idle(void) +{ + cpu_bringup(); cpu_idle(); } @@ -209,8 +213,6 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) cpu_set(cpu, cpu_present_map); } - - //init_xenbus_allowed_cpumask(); } static __cpuinit int @@ -278,12 +280,6 @@ static int __cpuinit xen_cpu_up(unsigned int cpu) struct task_struct *idle = idle_task(cpu); int rc; -#if 0 - rc = cpu_up_check(cpu); - if (rc) - return rc; -#endif - #ifdef CONFIG_X86_64 /* Allocate node local memory for AP pdas */ WARN_ON(cpu == 0); @@ -336,6 +332,42 @@ static void xen_smp_cpus_done(unsigned int max_cpus) { } +int xen_cpu_disable(void) +{ + unsigned int cpu = smp_processor_id(); + if (cpu == 0) + return -EBUSY; + + cpu_disable_common(); + + load_cr3(swapper_pg_dir); + return 0; +} + +void xen_cpu_die(unsigned int cpu) +{ + while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ/10); + } + unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); + + if (num_online_cpus() == 1) + alternatives_smp_switch(0); +} + +void xen_play_dead(void) +{ + play_dead_common(); + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + cpu_bringup(); +} + static void stop_self(void *v) { int cpu = smp_processor_id(); @@ -419,9 +451,13 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) static const struct smp_ops xen_smp_ops __initdata = { .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_smp_prepare_cpus, - .cpu_up = xen_cpu_up, .smp_cpus_done = xen_smp_cpus_done, + .cpu_up = xen_cpu_up, + .cpu_die = xen_cpu_die, + .cpu_disable = xen_cpu_disable, + .play_dead = xen_play_dead, + .smp_send_stop = xen_smp_send_stop, .smp_send_reschedule = xen_smp_send_reschedule, diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index d072823bc06..dd71e3a021c 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -357,6 +357,11 @@ void __cpuinit xen_init_lock_cpu(int cpu) printk("cpu %d spinlock event irq %d\n", cpu, irq); } +void xen_uninit_lock_cpu(int cpu) +{ + unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL); +} + void __init xen_init_spinlocks(void) { pv_lock_ops.spin_is_locked = xen_spin_is_locked; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 20182d9072c..004ba86326a 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -450,6 +450,14 @@ void xen_setup_timer(int cpu) setup_runstate_info(cpu); } +void xen_teardown_timer(int cpu) +{ + struct clock_event_device *evt; + BUG_ON(cpu == 0); + evt = &per_cpu(xen_clock_events, cpu); + unbind_from_irqhandler(evt->irq, NULL); +} + void xen_setup_cpu_clockevents(void) { BUG_ON(preemptible()); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 1e8bfdaa20d..8dbd97fd7f1 100644 --- 
a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -34,6 +34,7 @@ void __init xen_build_dynamic_phys_to_machine(void); void xen_init_irq_ops(void); void xen_setup_timer(int cpu); +void xen_teardown_timer(int cpu); cycle_t xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); unsigned long xen_tsc_khz(void); @@ -50,11 +51,16 @@ void xen_mark_init_mm_pinned(void); void __init xen_setup_vcpu_info_placement(void); +void xen_play_dead(void); +void xen_cpu_die(unsigned int cpu); +int xen_cpu_disable(void); + #ifdef CONFIG_SMP void xen_smp_init(void); void __init xen_init_spinlocks(void); __cpuinit void xen_init_lock_cpu(int cpu); +void xen_uninit_lock_cpu(int cpu); extern cpumask_t xen_cpu_initialized_map; #else -- cgit v1.2.3 From 2737146b3aa2cf8b5d5ae87a18c49fe1c374528b Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 8 Sep 2008 13:43:33 +0100 Subject: x86, xen: fix build when !CONFIG_HOTPLUG_CPU Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/smp.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index be5cbb2b7c6..bf51a466d62 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -332,6 +332,7 @@ static void xen_smp_cpus_done(unsigned int max_cpus) { } +#ifdef CONFIG_HOTPLUG_CPU int xen_cpu_disable(void) { unsigned int cpu = smp_processor_id(); @@ -368,6 +369,23 @@ void xen_play_dead(void) cpu_bringup(); } +#else /* !CONFIG_HOTPLUG_CPU */ +int xen_cpu_disable(void) +{ + return -ENOSYS; +} + +void xen_cpu_die(unsigned int cpu) +{ + BUG(); +} + +void xen_play_dead(void) +{ + BUG(); +} + +#endif static void stop_self(void *v) { int cpu = smp_processor_id(); -- cgit v1.2.3 From 26fd10517e810dd59ea050b052de24a75ee6dc07 Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 8 Sep 2008 13:43:34 +0100 Subject: xen: make CPU hotplug functions static There's no need for these functions to be accessed from outside of xen/smp.c Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/smp.c | 12 ++++++------ arch/x86/xen/xen-ops.h | 4 ---- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index bf51a466d62..d77da613b1d 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -333,7 +333,7 @@ static void xen_smp_cpus_done(unsigned int max_cpus) } #ifdef CONFIG_HOTPLUG_CPU -int xen_cpu_disable(void) +static int xen_cpu_disable(void) { unsigned int cpu = smp_processor_id(); if (cpu == 0) @@ -345,7 +345,7 @@ int xen_cpu_disable(void) return 0; } -void xen_cpu_die(unsigned int cpu) +static void xen_cpu_die(unsigned int cpu) { while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { current->state = TASK_UNINTERRUPTIBLE; @@ -362,7 +362,7 @@ void xen_cpu_die(unsigned int cpu) alternatives_smp_switch(0); } -void xen_play_dead(void) +static void xen_play_dead(void) { play_dead_common(); HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); @@ -370,17 +370,17 @@ void xen_play_dead(void) } #else /* !CONFIG_HOTPLUG_CPU */ -int xen_cpu_disable(void) +static int xen_cpu_disable(void) { return -ENOSYS; } -void xen_cpu_die(unsigned int cpu) +static void xen_cpu_die(unsigned int cpu) { BUG(); } -void xen_play_dead(void) +static void xen_play_dead(void) { BUG(); } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 8dbd97fd7f1..d7422dc2a55 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -51,10 
+51,6 @@ void xen_mark_init_mm_pinned(void); void __init xen_setup_vcpu_info_placement(void); -void xen_play_dead(void); -void xen_cpu_die(unsigned int cpu); -int xen_cpu_disable(void); - #ifdef CONFIG_SMP void xen_smp_init(void); -- cgit v1.2.3 From f7d0b926ac8c8ec0c7a83ee69409bd2e6bb39f81 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 9 Sep 2008 15:43:22 -0700 Subject: mm: define USE_SPLIT_PTLOCKS rather than repeating expression Define USE_SPLIT_PTLOCKS as a constant expression rather than repeating "NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS" all over the place. Signed-off-by: Jeremy Fitzhardinge Acked-by: Hugh Dickins Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index aa37469da69..2e1b6408849 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -646,7 +646,7 @@ static spinlock_t *lock_pte(struct page *page) { spinlock_t *ptl = NULL; -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +#if USE_SPLIT_PTLOCKS ptl = __pte_lockptr(page); spin_lock(ptl); #endif -- cgit v1.2.3 From 6a9e91846bf52cc70a0417de19fdfac224c435c4 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 9 Sep 2008 15:43:25 -0700 Subject: xen: fix pinning when not using split pte locks We only pin PTE pages when using split PTE locks, so don't do the pin/unpin when attaching/detaching pte pages to a pinned pagetable. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b106e825d26..8ca2f88bde1 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -826,7 +826,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) if (!PageHighMem(page)) { make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); - if (level == PT_PTE) + if (level == PT_PTE && USE_SPLIT_PTLOCKS) pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); } else /* make sure there are no stray mappings of @@ -894,7 +894,7 @@ static void xen_release_ptpage(u32 pfn, unsigned level) if (PagePinned(page)) { if (!PageHighMem(page)) { - if (level == PT_PTE) + if (level == PT_PTE && USE_SPLIT_PTLOCKS) pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } -- cgit v1.2.3 From 08115ab4d98cb577a83971ebd57cdfbcc6f50b68 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Mon, 29 Sep 2008 18:24:23 -0400 Subject: xen: make CONFIG_XEN_SAVE_RESTORE depend on CONFIG_XEN Xen options need to depend on XEN. Also, add newline at end of file. Without this patch you need to disable CONFIG_PM in order to disable CPU hotplugging. Signed-off-by: Chuck Ebbert Acked-by Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index d3e68465ace..87b9ab16642 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -26,7 +26,7 @@ config XEN_MAX_DOMAIN_MEMORY config XEN_SAVE_RESTORE bool - depends on PM + depends on XEN && PM default y config XEN_DEBUG_FS @@ -35,4 +35,4 @@ config XEN_DEBUG_FS default n help Enable statistics output and various tuning options in debugfs. - Enabling this option may incur a significant performance overhead. 
\ No newline at end of file + Enabling this option may incur a significant performance overhead. -- cgit v1.2.3 From db053b86f4b1ec790da2dafe2acb93be76288bb9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Oct 2008 16:41:31 -0700 Subject: xen: clean up x86-64 warnings There are a couple of Xen features which rely on directly accessing per-cpu data via a segment register, which is not yet available on x86-64. In the meantime, just disable direct access to the vcpu info structure; this leaves some of the code as dead, but it will come to life in time, and the warnings are suppressed. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 61 ++++++++--------------------------------------- arch/x86/xen/xen-asm_64.S | 20 +++++++++++++--- 2 files changed, 27 insertions(+), 54 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 8ca2f88bde1..85692c9f649 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -112,7 +112,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; * * 0: not available, 1: available */ -static int have_vcpu_info_placement = 1; +static int have_vcpu_info_placement = +#ifdef CONFIG_X86_32 + 1 +#else + 0 +#endif + ; + static void xen_vcpu_setup(int cpu) { @@ -941,6 +948,7 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) } #endif +#ifdef CONFIG_X86_32 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) { /* If there's an existing pte, then don't allow _PAGE_RW to be set */ @@ -959,6 +967,7 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) xen_set_pte(ptep, pte); } +#endif static __init void xen_pagetable_setup_start(pgd_t *base) { @@ -1025,7 +1034,6 @@ void xen_setup_vcpu_info_placement(void) /* xen_vcpu_setup managed to place the vcpu_info within the percpu area for all cpus, so make use of it */ -#ifdef CONFIG_X86_32 if (have_vcpu_info_placement) { printk(KERN_INFO "Xen: using vcpu_info placement\n"); @@ -1035,7 +1043,6 @@ void xen_setup_vcpu_info_placement(void) pv_irq_ops.irq_enable = xen_irq_enable_direct; pv_mmu_ops.read_cr2 = xen_read_cr2_direct; } -#endif } static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, @@ -1056,12 +1063,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, goto patch_site switch (type) { -#ifdef CONFIG_X86_32 SITE(pv_irq_ops, irq_enable); SITE(pv_irq_ops, irq_disable); SITE(pv_irq_ops, save_fl); SITE(pv_irq_ops, restore_fl); -#endif /* CONFIG_X86_32 */ #undef SITE patch_site: @@ -1399,48 +1404,11 @@ static void *m2v(phys_addr_t maddr) return __ka(m2p(maddr)); } -#ifdef CONFIG_X86_64 -static void walk(pgd_t *pgd, unsigned long addr) -{ - unsigned l4idx = pgd_index(addr); - unsigned l3idx = pud_index(addr); - unsigned l2idx = pmd_index(addr); - unsigned l1idx = pte_index(addr); - pgd_t l4; - pud_t l3; - pmd_t l2; - pte_t l1; - - xen_raw_printk("walk %p, %lx -> %d %d %d %d\n", - pgd, addr, l4idx, l3idx, l2idx, l1idx); - - l4 = pgd[l4idx]; - xen_raw_printk(" l4: %016lx\n", l4.pgd); - xen_raw_printk(" %016lx\n", pgd_val(l4)); - - l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx]; - xen_raw_printk(" l3: %016lx\n", l3.pud); - xen_raw_printk(" %016lx\n", pud_val(l3)); - - l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx]; - xen_raw_printk(" l2: %016lx\n", l2.pmd); - xen_raw_printk(" %016lx\n", pmd_val(l2)); - - l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx]; - xen_raw_printk(" l1: %016lx\n", l1.pte); - xen_raw_printk(" %016lx\n", pte_val(l1)); -} -#endif - static 
void set_page_prot(void *addr, pgprot_t prot) { unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); - xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n", - addr, pfn, get_phys_to_machine(pfn), - pgprot_val(prot), pte.pte); - if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) BUG(); } @@ -1698,15 +1666,6 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("about to get started...\n"); -#if 0 - xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n", - &boot_params, __pa_symbol(&boot_params), - __va(__pa_symbol(&boot_params))); - - walk(pgd, &boot_params); - walk(pgd, __va(__pa(&boot_params))); -#endif - /* Start the world */ #ifdef CONFIG_X86_32 i386_start_kernel(); diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 3b9bda46487..05794c566e8 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -26,8 +26,15 @@ /* Pseudo-flag used for virtual NMI, which we don't implement yet */ #define XEN_EFLAGS_NMI 0x80000000 -#if 0 -#include +#if 1 +/* + x86-64 does not yet support direct access to percpu variables + via a segment override, so we just need to make sure this code + never gets used + */ +#define BUG ud2a +#define PER_CPU_VAR(var, off) 0xdeadbeef +#endif /* Enable events. This clears the event mask and tests the pending @@ -35,6 +42,8 @@ events, then enter the hypervisor to get them handled. */ ENTRY(xen_irq_enable_direct) + BUG + /* Unmask events */ movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) @@ -58,6 +67,8 @@ ENDPATCH(xen_irq_enable_direct) non-zero. */ ENTRY(xen_irq_disable_direct) + BUG + movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) ENDPATCH(xen_irq_disable_direct) ret @@ -74,6 +85,8 @@ ENDPATCH(xen_irq_disable_direct) Xen and x86 use opposite senses (mask vs enable). */ ENTRY(xen_save_fl_direct) + BUG + testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) setz %ah addb %ah,%ah @@ -91,6 +104,8 @@ ENDPATCH(xen_save_fl_direct) if so. */ ENTRY(xen_restore_fl_direct) + BUG + testb $X86_EFLAGS_IF>>8, %ah setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) /* Preempt here doesn't matter because that will deal with @@ -133,7 +148,6 @@ check_events: pop %rcx pop %rax ret -#endif ENTRY(xen_adjust_exception_frame) mov 8+0(%rsp),%rcx -- cgit v1.2.3 From eefb47f6a1e855653d275cb90592a3587ea93a09 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 8 Oct 2008 13:01:39 -0700 Subject: xen: use spin_lock_nest_lock when pinning a pagetable When pinning/unpinning a pagetable with split pte locks, we can end up holding multiple pte locks at once (we need to hold the locks while there's a pending batched hypercall affecting the pte page). Because all the pte locks are in the same lock class, lockdep thinks that we're potentially taking a lock recursively. This warning is spurious because we always take the pte locks while holding mm->page_table_lock. lockdep now has spin_lock_nest_lock to express this kind of dominant lock use, so use it here so that lockdep knows what's going on. 
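In use, the annotation looks roughly like the kernel-style fragment below (not a standalone program; ptl[] is a hypothetical array of same-class pte spinlocks). Every nested lock is taken while the one dominant lock is held, and spin_lock_nest_lock() records that dominance, so lockdep stops treating the repeated same-class acquisitions as recursion:

spin_lock(&mm->page_table_lock);        /* the dominant lock */

for (i = 0; i < npages; i++)            /* all in one lock class ... */
        spin_lock_nest_lock(ptl[i], &mm->page_table_lock);

/* ... issue the batched hypercall touching all the pte pages ... */

while (npages--)                        /* release in reverse order */
        spin_unlock(ptl[npages]);

spin_unlock(&mm->page_table_lock);

The dominance argument is exactly the invariant stated above: the pte locks can only be held together while page_table_lock is held, so no true lock cycle is possible.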
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 74 +++++++++++++++++++++++++++++++++++------------------- arch/x86/xen/mmu.h | 3 --- 2 files changed, 48 insertions(+), 29 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 64e58681767..ae173f6edd8 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -651,9 +651,12 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) * For 64-bit, we must skip the Xen hole in the middle of the address * space, just after the big x86-64 virtual hole. */ -static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), +static int xen_pgd_walk(struct mm_struct *mm, + int (*func)(struct mm_struct *mm, struct page *, + enum pt_level), unsigned long limit) { + pgd_t *pgd = mm->pgd; int flush = 0; unsigned hole_low, hole_high; unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; @@ -698,7 +701,7 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), pud = pud_offset(&pgd[pgdidx], 0); if (PTRS_PER_PUD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pud), PT_PUD); + flush |= (*func)(mm, virt_to_page(pud), PT_PUD); for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { pmd_t *pmd; @@ -713,7 +716,7 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), pmd = pmd_offset(&pud[pudidx], 0); if (PTRS_PER_PMD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pmd), PT_PMD); + flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { struct page *pte; @@ -727,7 +730,7 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), continue; pte = pmd_page(pmd[pmdidx]); - flush |= (*func)(pte, PT_PTE); + flush |= (*func)(mm, pte, PT_PTE); } } } @@ -735,20 +738,20 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), out: /* Do the top level last, so that the callbacks can use it as a cue to do final things like tlb flushes. */ - flush |= (*func)(virt_to_page(pgd), PT_PGD); + flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); return flush; } /* If we're using split pte locks, then take the page's lock and return a pointer to it. Otherwise return NULL. */ -static spinlock_t *xen_pte_lock(struct page *page) +static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) { spinlock_t *ptl = NULL; #if USE_SPLIT_PTLOCKS ptl = __pte_lockptr(page); - spin_lock(ptl); + spin_lock_nest_lock(ptl, &mm->page_table_lock); #endif return ptl; @@ -772,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn) MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); } -static int xen_pin_page(struct page *page, enum pt_level level) +static int xen_pin_page(struct mm_struct *mm, struct page *page, + enum pt_level level) { unsigned pgfl = TestSetPagePinned(page); int flush; @@ -813,7 +817,7 @@ static int xen_pin_page(struct page *page, enum pt_level level) */ ptl = NULL; if (level == PT_PTE) - ptl = xen_pte_lock(page); + ptl = xen_pte_lock(page, mm); MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, pfn_pte(pfn, PAGE_KERNEL_RO), @@ -834,11 +838,11 @@ static int xen_pin_page(struct page *page, enum pt_level level) /* This is called just after a mm has been created, but it has not been used yet. We need to make sure that its pagetable is all read-only, and can be pinned. 
*/ -void xen_pgd_pin(pgd_t *pgd) +static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) { xen_mc_batch(); - if (xen_pgd_walk(pgd, xen_pin_page, USER_LIMIT)) { + if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) { /* re-enable interrupts for kmap_flush_unused */ xen_mc_issue(0); kmap_flush_unused(); @@ -852,25 +856,35 @@ void xen_pgd_pin(pgd_t *pgd) xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); if (user_pgd) { - xen_pin_page(virt_to_page(user_pgd), PT_PGD); + xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); } } #else /* CONFIG_X86_32 */ #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is pinnable */ - xen_pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), + PT_PMD); #endif xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); #endif /* CONFIG_X86_64 */ xen_mc_issue(0); } +static void xen_pgd_pin(struct mm_struct *mm) +{ + __xen_pgd_pin(mm, mm->pgd); +} + /* * On save, we need to pin all pagetables to make sure they get their * mfns turned into pfns. Search the list for any unpinned pgds and pin * them (unpinned pgds are not currently in use, probably because the * process is under construction or destruction). + * + * Expected to be called in stop_machine() ("equivalent to taking + * every spinlock in the system"), so the locking doesn't really + * matter all that much. */ void xen_mm_pin_all(void) { @@ -881,7 +895,7 @@ void xen_mm_pin_all(void) list_for_each_entry(page, &pgd_list, lru) { if (!PagePinned(page)) { - xen_pgd_pin((pgd_t *)page_address(page)); + __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); SetPageSavePinned(page); } } @@ -894,7 +908,8 @@ void xen_mm_pin_all(void) * that's before we have page structures to store the bits. So do all * the book-keeping now. */ -static __init int xen_mark_pinned(struct page *page, enum pt_level level) +static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page, + enum pt_level level) { SetPagePinned(page); return 0; @@ -902,10 +917,11 @@ static __init int xen_mark_pinned(struct page *page, enum pt_level level) void __init xen_mark_init_mm_pinned(void) { - xen_pgd_walk(init_mm.pgd, xen_mark_pinned, FIXADDR_TOP); + xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); } -static int xen_unpin_page(struct page *page, enum pt_level level) +static int xen_unpin_page(struct mm_struct *mm, struct page *page, + enum pt_level level) { unsigned pgfl = TestClearPagePinned(page); @@ -923,7 +939,7 @@ static int xen_unpin_page(struct page *page, enum pt_level level) * partially-pinned state. 
*/ if (level == PT_PTE) { - ptl = xen_pte_lock(page); + ptl = xen_pte_lock(page, mm); if (ptl) xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); @@ -945,7 +961,7 @@ static int xen_unpin_page(struct page *page, enum pt_level level) } /* Release a pagetables pages back as normal RW */ -static void xen_pgd_unpin(pgd_t *pgd) +static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) { xen_mc_batch(); @@ -957,21 +973,27 @@ static void xen_pgd_unpin(pgd_t *pgd) if (user_pgd) { xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); - xen_unpin_page(virt_to_page(user_pgd), PT_PGD); + xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); } } #endif #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is unpinned */ - xen_unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), + PT_PMD); #endif - xen_pgd_walk(pgd, xen_unpin_page, USER_LIMIT); + xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT); xen_mc_issue(0); } +static void xen_pgd_unpin(struct mm_struct *mm) +{ + __xen_pgd_unpin(mm, mm->pgd); +} + /* * On resume, undo any pinning done at save, so that the rest of the * kernel doesn't see any unexpected pinned pagetables. @@ -986,7 +1008,7 @@ void xen_mm_unpin_all(void) list_for_each_entry(page, &pgd_list, lru) { if (PageSavePinned(page)) { BUG_ON(!PagePinned(page)); - xen_pgd_unpin((pgd_t *)page_address(page)); + __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); ClearPageSavePinned(page); } } @@ -997,14 +1019,14 @@ void xen_mm_unpin_all(void) void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) { spin_lock(&next->page_table_lock); - xen_pgd_pin(next->pgd); + xen_pgd_pin(next); spin_unlock(&next->page_table_lock); } void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { spin_lock(&mm->page_table_lock); - xen_pgd_pin(mm->pgd); + xen_pgd_pin(mm); spin_unlock(&mm->page_table_lock); } @@ -1095,7 +1117,7 @@ void xen_exit_mmap(struct mm_struct *mm) /* pgd may not be pinned in the error exit path of execve */ if (xen_page_pinned(mm->pgd)) - xen_pgd_unpin(mm->pgd); + xen_pgd_unpin(mm); spin_unlock(&mm->page_table_lock); } diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 0f59bd03f9e..98d71659da5 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -18,9 +18,6 @@ void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); void xen_exit_mmap(struct mm_struct *mm); -void xen_pgd_pin(pgd_t *pgd); -//void xen_pgd_unpin(pgd_t *pgd); - pteval_t xen_pte_val(pte_t); pmdval_t xen_pmd_val(pmd_t); pgdval_t xen_pgd_val(pgd_t); -- cgit v1.2.3 From 5dc64a3442b98eaa0e3730c35fcf00cf962a93e7 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 10 Oct 2008 11:27:38 +0100 Subject: xen: do not reserve 2 pages of padding between hypervisor and fixmap. When reserving space for the hypervisor the Xen paravirt backend adds an extra two pages (this was carried forward from the 2.6.18-xen tree which had them "for safety"). Depending on various CONFIG options this can cause the boot time fixmaps to span multiple PMDs which is not supported and triggers a WARN in early_ioremap_init(). This was exposed by 2216d199b1430d1c0affb1498a9ebdbd9c0de439 which moved the dmi table parsing earlier. x86: fix CONFIG_X86_RESERVE_LOW_64K=y The bad_bios_dmi_table() quirk never triggered because we do DMI setup too late. Move it a bit earlier. 
There is no real reason to reserve these two extra pages and the fixmap already incorporates FIX_HOLE which serves the same purpose. None of the other callers of reserve_top_address do this. Signed-off-by: Ian Campbell Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 85692c9f649..977a54255fb 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1370,7 +1370,7 @@ static void __init xen_reserve_top(void) if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) top = pp.virt_start; - reserve_top_address(-top + 2 * PAGE_SIZE); + reserve_top_address(-top); #endif /* CONFIG_X86_32 */ } -- cgit v1.2.3 From 3807f345b2c610336c17c7624a0d496a38df75a0 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 28 Jul 2008 11:47:52 -0300 Subject: x86: paravirt: factor out cpu_khz to common code KVM intends to use paravirt code to calibrate khz. Xen current code will do just fine. So as a first step, factor out code to pvclock.c. Signed-off-by: Glauber Costa Signed-off-by: Avi Kivity --- arch/x86/xen/time.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 004ba86326a..c9f7cda48ed 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -198,17 +198,10 @@ unsigned long long xen_sched_clock(void) /* Get the TSC speed from Xen */ unsigned long xen_tsc_khz(void) { - u64 xen_khz = 1000000ULL << 32; - const struct pvclock_vcpu_time_info *info = + struct pvclock_vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_info[0].time; - do_div(xen_khz, info->tsc_to_system_mul); - if (info->tsc_shift < 0) - xen_khz <<= -info->tsc_shift; - else - xen_khz >>= info->tsc_shift; - - return xen_khz; + return pvclock_tsc_khz(info); } cycle_t xen_clocksource_read(void) -- cgit v1.2.3
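For reference, the arithmetic that moved into pvclock_tsc_khz() simply inverts the pvclock fixed-point scaling: system time advances by (tsc_delta * tsc_to_system_mul) >> 32, pre-shifted by tsc_shift, so the TSC rate in kHz falls out of dividing (10^6 << 32) by the multiplier and undoing the shift. A self-contained sketch of that computation, with an example multiplier made up for a 2.4 GHz TSC:

#include <stdint.h>
#include <stdio.h>

/* Same computation the removed xen_tsc_khz() performed, as factored
 * into pvclock_tsc_khz(): invert the fixed-point multiplier, then
 * undo the shift. */
static uint64_t tsc_khz_from_pvclock(uint32_t tsc_to_system_mul,
                                     int8_t tsc_shift)
{
        uint64_t khz = 1000000ULL << 32;

        khz /= tsc_to_system_mul;       /* mul encodes ns-per-tick * 2^32 */

        if (tsc_shift < 0)
                khz <<= -tsc_shift;     /* undo a right shift */
        else
                khz >>= tsc_shift;      /* undo a left shift */

        return khz;
}

int main(void)
{
        /* mul ~= 2^32 / 2.4 encodes a 2.4 GHz TSC at shift 0 */
        printf("%llu kHz\n", (unsigned long long)
               tsc_khz_from_pvclock(1789569706u, 0));
        return 0;
}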