From 30dd568c912602b7dbd609a45d053e01b13422bb Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Tue, 21 Jul 2009 15:56:48 +0200
Subject: x86, perf_counter, bts: Add BTS support to perfcounters

Implement a performance counter with:

    attr.type           = PERF_TYPE_HARDWARE
    attr.config         = PERF_COUNT_HW_BRANCH_INSTRUCTIONS
    attr.sample_period  = 1

The counter uses branch trace store (BTS) on x86 hardware, if available.

The from and to address for each branch can be sampled using:

    PERF_SAMPLE_IP      for the from address
    PERF_SAMPLE_ADDR    for the to address

[ v2: address review feedback, fix bugs ]

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_counter.h |  10 ++
 arch/x86/kernel/cpu/perf_counter.c  | 325 +++++++++++++++++++++++++++++++++++-
 2 files changed, 329 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index fa64e401589..e7b7c938ae2 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -84,6 +84,16 @@ union cpuid10_edx {
 #define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
 #define X86_PMC_IDX_FIXED_BUS_CYCLES			(X86_PMC_IDX_FIXED + 2)
 
+/*
+ * We model BTS tracing as another fixed-mode PMC.
+ *
+ * We choose a value in the middle of the fixed counter range, since lower
+ * values are used by actual fixed counters and higher values are used
+ * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
+ */
+#define X86_PMC_IDX_FIXED_BTS				(X86_PMC_IDX_FIXED + 16)
+
+
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
 extern void perf_counters_lapic_init(void);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a7aa8f90095..b237c181aa4 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -6,6 +6,7 @@
  *  Copyright (C) 2009 Jaswinder Singh Rajput
  *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
  *
  *  For licencing details see kernel-base/COPYING
  */
@@ -20,6 +21,7 @@
 #include <linux/sched.h>
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
+#include <linux/cpu.h>
 
 #include <asm/apic.h>
 #include <asm/stacktrace.h>
@@ -27,12 +29,52 @@
 
 static u64 perf_counter_mask __read_mostly;
 
+/* The maximal number of PEBS counters: */
+#define MAX_PEBS_COUNTERS	4
+
+/* The size of a BTS record in bytes: */
+#define BTS_RECORD_SIZE		24
+
+/* The size of a per-cpu BTS buffer in bytes: */
+#define BTS_BUFFER_SIZE		(BTS_RECORD_SIZE * 1024)
+
+/* The BTS overflow threshold in bytes from the end of the buffer: */
+#define BTS_OVFL_TH		(BTS_RECORD_SIZE * 64)
+
+
+/*
+ * Bits in the debugctlmsr controlling branch tracing.
+ */
+#define X86_DEBUGCTL_TR			(1 << 6)
+#define X86_DEBUGCTL_BTS		(1 << 7)
+#define X86_DEBUGCTL_BTINT		(1 << 8)
+#define X86_DEBUGCTL_BTS_OFF_OS		(1 << 9)
+#define X86_DEBUGCTL_BTS_OFF_USR	(1 << 10)
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+	u64	bts_buffer_base;
+	u64	bts_index;
+	u64	bts_absolute_maximum;
+	u64	bts_interrupt_threshold;
+	u64	pebs_buffer_base;
+	u64	pebs_index;
+	u64	pebs_absolute_maximum;
+	u64	pebs_interrupt_threshold;
+	u64	pebs_counter_reset[MAX_PEBS_COUNTERS];
+};
+
 struct cpu_hw_counters {
 	struct perf_counter	*counters[X86_PMC_IDX_MAX];
 	unsigned long		used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		interrupts;
 	int			enabled;
+	struct debug_store	*ds;
 };
 
 /*
@@ -57,6 +99,8 @@ struct x86_pmu {
 	u64		counter_mask;
 	u64		max_period;
 	u64		intel_ctrl;
+	void		(*enable_bts)(u64 config);
+	void		(*disable_bts)(void);
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -576,6 +620,9 @@ x86_perf_counter_update(struct perf_counter *counter,
 	u64 prev_raw_count, new_raw_count;
 	s64 delta;
 
+	if (idx == X86_PMC_IDX_FIXED_BTS)
+		return 0;
+
 	/*
 	 * Careful: an NMI might modify the previous counter value.
 	 *
@@ -659,10 +706,109 @@ static void release_pmc_hardware(void)
 		enable_lapic_nmi_watchdog();
 }
 
+static inline bool bts_available(void)
+{
+	return x86_pmu.enable_bts != NULL;
+}
+
+static inline void init_debug_store_on_cpu(int cpu)
+{
+	struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
+
+	if (!ds)
+		return;
+
+	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
+		     (u32)((u64)(long)ds), (u32)((u64)(long)ds >> 32));
+}
+
+static inline void fini_debug_store_on_cpu(int cpu)
+{
+	if (!per_cpu(cpu_hw_counters, cpu).ds)
+		return;
+
+	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+}
+
+static void release_bts_hardware(void)
+{
+	int cpu;
+
+	if (!bts_available())
+		return;
+
+	get_online_cpus();
+
+	for_each_online_cpu(cpu)
+		fini_debug_store_on_cpu(cpu);
+
+	for_each_possible_cpu(cpu) {
+		struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
+
+		if (!ds)
+			continue;
+
+		per_cpu(cpu_hw_counters, cpu).ds = NULL;
+
+		kfree((void *)(long)ds->bts_buffer_base);
+		kfree(ds);
+	}
+
+	put_online_cpus();
+}
+
+static int reserve_bts_hardware(void)
+{
+	int cpu, err = 0;
+
+	if (!bts_available())
+		return -EOPNOTSUPP;
+
+	get_online_cpus();
+
+	for_each_possible_cpu(cpu) {
+		struct debug_store *ds;
+		void *buffer;
+
+		err = -ENOMEM;
+		buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
+		if (unlikely(!buffer))
+			break;
+
+		ds = kzalloc(sizeof(*ds), GFP_KERNEL);
+		if (unlikely(!ds)) {
+			kfree(buffer);
+			break;
+		}
+
+		ds->bts_buffer_base = (u64)(long)buffer;
+		ds->bts_index = ds->bts_buffer_base;
+		ds->bts_absolute_maximum =
+			ds->bts_buffer_base + BTS_BUFFER_SIZE;
+		ds->bts_interrupt_threshold =
+			ds->bts_absolute_maximum - BTS_OVFL_TH;
+
+		per_cpu(cpu_hw_counters, cpu).ds = ds;
+		err = 0;
+	}
+
+	if (err)
+		release_bts_hardware();
+	else {
+		for_each_online_cpu(cpu)
+			init_debug_store_on_cpu(cpu);
+	}
+
+	put_online_cpus();
+
+	return err;
+}
+
 static void hw_perf_counter_destroy(struct perf_counter *counter)
 {
 	if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
 		release_pmc_hardware();
+		release_bts_hardware();
 		mutex_unlock(&pmc_reserve_mutex);
 	}
 }
@@ -705,6 +851,42 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
 	return 0;
 }
 
+static void intel_pmu_enable_bts(u64 config)
+{
+	unsigned long debugctlmsr;
+
+	debugctlmsr = get_debugctlmsr();
+
+	debugctlmsr |= X86_DEBUGCTL_TR;
+	debugctlmsr |= X86_DEBUGCTL_BTS;
+	debugctlmsr |= X86_DEBUGCTL_BTINT;
+
+	if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
+
+	if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
+
+	update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_disable_bts(void)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	unsigned long debugctlmsr;
+
+	if (!cpuc->ds)
+		return;
+
+	debugctlmsr = get_debugctlmsr();
+
+	debugctlmsr &=
+		~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
+		  X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
+
+	update_debugctlmsr(debugctlmsr);
+}
+
 /*
  * Setup the hardware configuration for a given attr_type
  */
@@ -721,9 +903,13 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	err = 0;
 	if (!atomic_inc_not_zero(&active_counters)) {
 		mutex_lock(&pmc_reserve_mutex);
-		if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
-			err = -EBUSY;
-		else
+		if (atomic_read(&active_counters) == 0) {
+			if (!reserve_pmc_hardware())
+				err = -EBUSY;
+			else
+				reserve_bts_hardware();
+		}
+		if (!err)
 			atomic_inc(&active_counters);
 		mutex_unlock(&pmc_reserve_mutex);
 	}
@@ -801,7 +987,18 @@ static void p6_pmu_disable_all(void)
 
 static void intel_pmu_disable_all(void)
 {
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	if (!cpuc->enabled)
+		return;
+
+	cpuc->enabled = 0;
+	barrier();
+
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+		intel_pmu_disable_bts();
 }
 
 static void amd_pmu_disable_all(void)
@@ -859,7 +1056,25 @@ static void p6_pmu_enable_all(void)
 
 static void intel_pmu_enable_all(void)
 {
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	if (cpuc->enabled)
+		return;
+
+	cpuc->enabled = 1;
+	barrier();
+
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
+	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+		struct perf_counter *counter =
+			cpuc->counters[X86_PMC_IDX_FIXED_BTS];
+
+		if (WARN_ON_ONCE(!counter))
+			return;
+
+		intel_pmu_enable_bts(counter->hw.config);
+	}
 }
 
 static void amd_pmu_enable_all(void)
@@ -946,6 +1161,11 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
 static inline void
 intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
 {
+	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+		intel_pmu_disable_bts();
+		return;
+	}
+
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
 		intel_pmu_disable_fixed(hwc, idx);
 		return;
@@ -974,6 +1194,9 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 	s64 period = hwc->sample_period;
 	int err, ret = 0;
 
+	if (idx == X86_PMC_IDX_FIXED_BTS)
+		return 0;
+
 	/*
 	 * If we are way outside a reasoable range then just skip forward:
 	 */
@@ -1056,6 +1279,14 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 
 static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 {
+	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+		if (!__get_cpu_var(cpu_hw_counters).enabled)
+			return;
+
+		intel_pmu_enable_bts(hwc->config);
+		return;
+	}
+
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
 		intel_pmu_enable_fixed(hwc, idx);
 		return;
@@ -1077,11 +1308,16 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 {
 	unsigned int event;
 
+	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
+
+	if (unlikely((event ==
+		      x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
+		     (hwc->sample_period == 1)))
+		return X86_PMC_IDX_FIXED_BTS;
+
 	if (!x86_pmu.num_counters_fixed)
 		return -1;
 
-	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
-
 	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
 		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
 	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
@@ -1102,7 +1338,25 @@ static int x86_pmu_enable(struct perf_counter *counter)
 	int idx;
 
 	idx = fixed_mode_idx(counter, hwc);
-	if (idx >= 0) {
+	if (idx == X86_PMC_IDX_FIXED_BTS) {
+		/*
+		 * Try to use BTS for branch tracing. If that is not
+		 * available, try to get a generic counter.
+		 */
+		if (unlikely(!cpuc->ds))
+			goto try_generic;
+
+		/*
+		 * Try to get the fixed counter, if that is already taken
+		 * then try to get a generic counter:
+		 */
+		if (test_and_set_bit(idx, cpuc->used_mask))
+			goto try_generic;
+
+		hwc->config_base	= 0;
+		hwc->counter_base	= 0;
+		hwc->idx		= idx;
+	} else if (idx >= 0) {
 		/*
 		 * Try to get the fixed counter, if that is already taken
 		 * then try to get a generic counter:
@@ -1213,6 +1467,45 @@ void perf_counter_print_debug(void)
 	local_irq_restore(flags);
 }
 
+static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
+				       struct perf_sample_data *data)
+{
+	struct debug_store *ds = cpuc->ds;
+	struct bts_record {
+		u64	from;
+		u64	to;
+		u64	flags;
+	};
+	struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS];
+	unsigned long orig_ip = data->regs->ip;
+	u64 at;
+
+	if (!counter)
+		return;
+
+	if (!ds)
+		return;
+
+	for (at = ds->bts_buffer_base;
+	     at < ds->bts_index;
+	     at += sizeof(struct bts_record)) {
+		struct bts_record *rec = (struct bts_record *)(long)at;
+
+		data->regs->ip	= rec->from;
+		data->addr	= rec->to;
+
+		perf_counter_output(counter, 1, data);
+	}
+
+	ds->bts_index = ds->bts_buffer_base;
+
+	data->regs->ip	= orig_ip;
+	data->addr	= 0;
+
+	/* There's new data available. */
+	counter->pending_kill = POLL_IN;
+}
+
 static void x86_pmu_disable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -1237,6 +1530,15 @@ static void x86_pmu_disable(struct perf_counter *counter)
 	 * that we are disabling:
 	 */
 	x86_perf_counter_update(counter, hwc, idx);
+
+	/* Drain the remaining BTS records. */
+	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+		struct perf_sample_data data;
+		struct pt_regs regs;
+
+		data.regs = &regs;
+		intel_pmu_drain_bts_buffer(cpuc, &data);
+	}
 	cpuc->counters[idx] = NULL;
 	clear_bit(idx, cpuc->used_mask);
 
@@ -1264,6 +1566,7 @@ static int intel_pmu_save_and_restart(struct perf_counter *counter)
 
 static void intel_pmu_reset(void)
 {
+	struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds;
 	unsigned long flags;
 	int idx;
 
@@ -1281,6 +1584,8 @@ static void intel_pmu_reset(void)
 	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
 		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
 	}
+	if (ds)
+		ds->bts_index = ds->bts_buffer_base;
 
 	local_irq_restore(flags);
 }
@@ -1346,6 +1651,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 	cpuc = &__get_cpu_var(cpu_hw_counters);
 
 	perf_disable();
+	intel_pmu_drain_bts_buffer(cpuc, &data);
 	status = intel_pmu_get_status();
 	if (!status) {
 		perf_enable();
@@ -1547,6 +1853,8 @@ static struct x86_pmu intel_pmu = {
 	 * the generic counter period:
 	 */
 	.max_period		= (1ULL << 31) - 1,
+	.enable_bts		= intel_pmu_enable_bts,
+	.disable_bts		= intel_pmu_disable_bts,
 };
 
 static struct x86_pmu amd_pmu = {
@@ -1936,3 +2244,8 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 
 	return entry;
 }
+
+void hw_perf_counter_setup_online(int cpu)
+{
+	init_debug_store_on_cpu(cpu);
+}
-- 
cgit v1.2.3


From 1660e9d3d04b6c636b7171bf6c08ac7b82a7de79 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 17 Aug 2009 14:36:32 +1000
Subject: powerpc/32: Always order writes to halves of 64-bit PTEs

On 32-bit systems with 64-bit PTEs, the PTEs have to be written in two
32-bit halves.  On SMP we write the higher-order half and then the
lower-order half, with a write barrier between the two halves, but on
UP there was no particular ordering of the writes to the two halves.

This extends the ordering that we already do on SMP to the UP case as
well.  The reason is that with the perf_counter subsystem potentially
accessing user memory at interrupt time to get stack traces, we have
to be careful not to create an incorrect but apparently valid PTE even
on UP.

Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/pgtable.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index eb17da78112..2a5da069714 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -104,8 +104,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
 	else
 		pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte));
 
-#elif defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT) && defined(CONFIG_SMP)
-	/* Second case is 32-bit with 64-bit PTE in SMP mode. In this case, we
+#elif defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT)
+	/* Second case is 32-bit with 64-bit PTE.  In this case, we
 	 * can just store as long as we do the two halves in the right order
 	 * with a barrier in between. This is possible because we take care,
 	 * in the hash code, to pre-invalidate if the PTE was already hashed,
@@ -140,7 +140,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
 
 #else
 	/* Anything else just stores the PTE normally. That covers all 64-bit
-	 * cases, and 32-bit non-hash with 64-bit PTEs in UP mode
+	 * cases, and 32-bit non-hash with 32-bit PTEs.
 	 */
 	*ptep = pte;
 #endif
-- 
cgit v1.2.3


From 9c1e105238c474d19905af504f2e7f42d4f71f9e Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 17 Aug 2009 15:17:54 +1000
Subject: powerpc: Allow perf_counters to access user memory at interrupt time

This provides a mechanism to allow the perf_counters code to access
user memory in a PMU interrupt routine.  Such an access can cause
various kinds of interrupt: SLB miss, MMU hash table miss, segment
table miss, or TLB miss, depending on the processor.  This commit
only deals with 64-bit classic/server processors, which use an MMU
hash table.  32-bit processors are already able to access user memory
at interrupt time.  Since we don't soft-disable on 32-bit, we avoid
the possibility of reentering hash_page or the TLB miss handlers,
since they run with interrupts disabled.

On 64-bit processors, an SLB miss interrupt on a user address will
update the slb_cache and slb_cache_ptr fields in the paca.  This is
OK except in the case where a PMU interrupt occurs in switch_slb,
which also accesses those fields.  To prevent this, we hard-disable
interrupts in switch_slb.  Interrupts are already soft-disabled at
this point, and will get hard-enabled when they get soft-enabled
later.

This also reworks slb_flush_and_rebolt: to avoid hard-disabling twice,
and to make sure that it clears the slb_cache_ptr when called from
other callers than switch_slb, the existing routine is renamed to
__slb_flush_and_rebolt, which is called by switch_slb and the new
version of slb_flush_and_rebolt.

Similarly, switch_stab (used on POWER3 and RS64 processors) gets a
hard_irq_disable() to protect the per-cpu variables used there and
in ste_allocate.

If an MMU hash table miss interrupt occurs, normally we would call
hash_page to look up the Linux PTE for the address and create a HPTE.
However, hash_page is fairly complex and takes some locks, so to
avoid the possibility of deadlock, we check the preemption count
to see if we are in a (pseudo-)NMI handler, and if so, we don't call
hash_page but instead treat it like a bad access that will get
reported up through the exception table mechanism.  An interrupt
whose handler runs even though the interrupt occurred when
soft-disabled (such as the PMU interrupt) is considered a pseudo-NMI
handler, which should use nmi_enter()/nmi_exit() rather than
irq_enter()/irq_exit().

Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/asm-offsets.c    |  2 ++
 arch/powerpc/kernel/exceptions-64s.S | 19 ++++++++++++++++++
 arch/powerpc/mm/slb.c                | 37 +++++++++++++++++++++++++-----------
 arch/powerpc/mm/stab.c               | 11 ++++++++++-
 4 files changed, 57 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 561b6465231..197b15646ee 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -67,6 +67,8 @@ int main(void)
 	DEFINE(MMCONTEXTID, offsetof(struct mm_struct, context.id));
 #ifdef CONFIG_PPC64
 	DEFINE(AUDITCONTEXT, offsetof(struct task_struct, audit_context));
+	DEFINE(SIGSEGV, SIGSEGV);
+	DEFINE(NMI_MASK, NMI_MASK);
 #else
 	DEFINE(THREAD_INFO, offsetof(struct task_struct, stack));
 #endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index eb898112e57..8ac85e08ffa 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -729,6 +729,11 @@ BEGIN_FTR_SECTION
 	bne-	do_ste_alloc		/* If so handle it */
 END_FTR_SECTION_IFCLR(CPU_FTR_SLB)
 
+	clrrdi	r11,r1,THREAD_SHIFT
+	lwz	r0,TI_PREEMPT(r11)	/* If we're in an "NMI" */
+	andis.	r0,r0,NMI_MASK@h	/* (i.e. an irq when soft-disabled) */
+	bne	77f			/* then don't call hash_page now */
+
 	/*
 	 * On iSeries, we soft-disable interrupts here, then
 	 * hard-enable interrupts so that the hash_page code can spin on
@@ -833,6 +838,20 @@ handle_page_fault:
 	bl	.low_hash_fault
 	b	.ret_from_except
 
+/*
+ * We come here as a result of a DSI at a point where we don't want
+ * to call hash_page, such as when we are accessing memory (possibly
+ * user memory) inside a PMU interrupt that occurred while interrupts
+ * were soft-disabled.  We want to invoke the exception handler for
+ * the access, or panic if there isn't a handler.
+ */
+77:	bl	.save_nvgprs
+	mr	r4,r3
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	li	r5,SIGSEGV
+	bl	.bad_page_fault
+	b	.ret_from_except
+
 	/* here we have a segment miss */
 do_ste_alloc:
 	bl	.ste_allocate		/* try to insert stab entry */
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 5b7038f248b..a685652effe 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -92,15 +92,13 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize,
 		     : "memory" );
 }
 
-void slb_flush_and_rebolt(void)
+static void __slb_flush_and_rebolt(void)
 {
 	/* If you change this make sure you change SLB_NUM_BOLTED
 	 * appropriately too. */
 	unsigned long linear_llp, vmalloc_llp, lflags, vflags;
 	unsigned long ksp_esid_data, ksp_vsid_data;
 
-	WARN_ON(!irqs_disabled());
-
 	linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
 	vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
 	lflags = SLB_VSID_KERNEL | linear_llp;
@@ -117,12 +115,6 @@ void slb_flush_and_rebolt(void)
 		ksp_vsid_data = get_slb_shadow()->save_area[2].vsid;
 	}
 
-	/*
-	 * We can't take a PMU exception in the following code, so hard
-	 * disable interrupts.
-	 */
-	hard_irq_disable();
-
 	/* We need to do this all in asm, so we're sure we don't touch
 	 * the stack between the slbia and rebolting it. */
 	asm volatile("isync\n"
@@ -139,6 +131,21 @@ void slb_flush_and_rebolt(void)
 		     : "memory");
 }
 
+void slb_flush_and_rebolt(void)
+{
+
+	WARN_ON(!irqs_disabled());
+
+	/*
+	 * We can't take a PMU exception in the following code, so hard
+	 * disable interrupts.
+	 */
+	hard_irq_disable();
+
+	__slb_flush_and_rebolt();
+	get_paca()->slb_cache_ptr = 0;
+}
+
 void slb_vmalloc_update(void)
 {
 	unsigned long vflags;
@@ -180,12 +187,20 @@ static inline int esids_match(unsigned long addr1, unsigned long addr2)
 /* Flush all user entries from the segment table of the current processor. */
 void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 {
-	unsigned long offset = get_paca()->slb_cache_ptr;
+	unsigned long offset;
 	unsigned long slbie_data = 0;
 	unsigned long pc = KSTK_EIP(tsk);
 	unsigned long stack = KSTK_ESP(tsk);
 	unsigned long unmapped_base;
 
+	/*
+	 * We need interrupts hard-disabled here, not just soft-disabled,
+	 * so that a PMU interrupt can't occur, which might try to access
+	 * user memory (to get a stack trace) and possibly cause an SLB miss
+	 * which would update the slb_cache/slb_cache_ptr fields in the PACA.
+	 */
+	hard_irq_disable();
+	offset = get_paca()->slb_cache_ptr;
 	if (!cpu_has_feature(CPU_FTR_NO_SLBIE_B) &&
 	    offset <= SLB_CACHE_ENTRIES) {
 		int i;
@@ -200,7 +215,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 		}
 		asm volatile("isync" : : : "memory");
 	} else {
-		slb_flush_and_rebolt();
+		__slb_flush_and_rebolt();
 	}
 
 	/* Workaround POWER5 < DD2.1 issue */
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
index 98cd1dc2ae7..ab5fb48b3e9 100644
--- a/arch/powerpc/mm/stab.c
+++ b/arch/powerpc/mm/stab.c
@@ -164,7 +164,7 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
 {
 	struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
 	struct stab_entry *ste;
-	unsigned long offset = __get_cpu_var(stab_cache_ptr);
+	unsigned long offset;
 	unsigned long pc = KSTK_EIP(tsk);
 	unsigned long stack = KSTK_ESP(tsk);
 	unsigned long unmapped_base;
@@ -172,6 +172,15 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
 	/* Force previous translations to complete. DRENG */
 	asm volatile("isync" : : : "memory");
 
+	/*
+	 * We need interrupts hard-disabled here, not just soft-disabled,
+	 * so that a PMU interrupt can't occur, which might try to access
+	 * user memory (to get a stack trace) and possibly cause an STAB miss
+	 * which would update the stab_cache/stab_cache_ptr per-cpu variables.
+	 */
+	hard_irq_disable();
+
+	offset = __get_cpu_var(stab_cache_ptr);
 	if (offset <= NR_STAB_CACHE_ENTRIES) {
 		int i;
 
-- 
cgit v1.2.3


From 20002ded4d937ca87aca6253b874920a96a763c4 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 18 Aug 2009 08:25:32 +1000
Subject: perf_counter: powerpc: Add callchain support

This adds support for tracing callchains for powerpc, both 32-bit
and 64-bit, and both in the kernel and userspace, from PMU interrupt
context.

The first three entries stored for each callchain are the NIP (next
instruction pointer), LR (link register), and the contents of the LR
save area in the second stack frame (the first is ignored because the
ABI convention on powerpc is that functions save their return address
in their caller's stack frame).  Because leaf functions don't have to
save their return address (LR value) and don't have to establish a
stack frame, it's possible for either or both of LR and the second
stack frame's LR save area to have valid return addresses in them.
This is basically impossible to disambiguate without either reading
the code or looking at auxiliary information such as CFI tables.
Since we don't want to do either of those things at interrupt time,
we store both LR and the second stack frame's LR save area.

Once we get past the second stack frame, there is no ambiguity; all
return addresses we get are reliable.

For kernel traces, we check whether they are valid kernel instruction
addresses and store zero instead if they are not (rather than
omitting them, which would make it impossible for userspace to know
which was which).  We also store zero instead of the second stack
frame's LR save area value if it is the same as LR.

For kernel traces, we check for interrupt frames, and for user traces,
we check for signal frames.  In each case, since we're starting a new
trace, we store a PERF_CONTEXT_KERNEL/USER marker so that userspace
knows that the next three entries are NIP, LR and the second stack frame
for the interrupted context.

We read user memory with __get_user_inatomic.  On 64-bit, if this
PMU interrupt occurred while interrupts are soft-disabled, and
there is no MMU hash table entry for the page, we will get an
-EFAULT return from __get_user_inatomic even if there is a valid
Linux PTE for the page, since hash_page isn't reentrant.  Thus we
have code here to read the Linux PTE and access the page via the
kernel linear mapping.  Since 64-bit doesn't use (or need) highmem
there is no need to do kmap_atomic.  On 32-bit, we don't do soft
interrupt disabling, so this complication doesn't occur and there
is no need to fall back to reading the Linux PTE, since hash_page
(or the TLB miss handler) will get called automatically if necessary.

Note that we cannot get PMU interrupts in the interval during
context switch between switch_mm (which switches the user address
space) and switch_to (which actually changes current to the new
process).  On 64-bit this is because interrupts are hard-disabled
in switch_mm and stay hard-disabled until they are soft-enabled
later, after switch_to has returned.  So there is no possibility
of trying to do a user stack trace when the user address space is
not current's address space.

Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/Makefile         |   2 +-
 arch/powerpc/kernel/perf_callchain.c | 527 +++++++++++++++++++++++++++++++++++
 2 files changed, 528 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/kernel/perf_callchain.c

(limited to 'arch')

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index b73396b9390..9619285f64e 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -97,7 +97,7 @@ obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
-obj-$(CONFIG_PPC_PERF_CTRS)	+= perf_counter.o
+obj-$(CONFIG_PPC_PERF_CTRS)	+= perf_counter.o perf_callchain.o
 obj64-$(CONFIG_PPC_PERF_CTRS)	+= power4-pmu.o ppc970-pmu.o power5-pmu.o \
 				   power5+-pmu.o power6-pmu.o power7-pmu.o
 obj32-$(CONFIG_PPC_PERF_CTRS)	+= mpc7450-pmu.o
diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c
new file mode 100644
index 00000000000..f74b62c6751
--- /dev/null
+++ b/arch/powerpc/kernel/perf_callchain.c
@@ -0,0 +1,527 @@
+/*
+ * Performance counter callchain support - powerpc architecture code
+ *
+ * Copyright © 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_counter.h>
+#include <linux/percpu.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <asm/ptrace.h>
+#include <asm/pgtable.h>
+#include <asm/sigcontext.h>
+#include <asm/ucontext.h>
+#include <asm/vdso.h>
+#ifdef CONFIG_PPC64
+#include "ppc32.h"
+#endif
+
+/*
+ * Store another value in a callchain_entry.
+ */
+static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip)
+{
+	unsigned int nr = entry->nr;
+
+	if (nr < PERF_MAX_STACK_DEPTH) {
+		entry->ip[nr] = ip;
+		entry->nr = nr + 1;
+	}
+}
+
+/*
+ * Is sp valid as the address of the next kernel stack frame after prev_sp?
+ * The next frame may be in a different stack area but should not go
+ * back down in the same stack area.
+ */
+static int valid_next_sp(unsigned long sp, unsigned long prev_sp)
+{
+	if (sp & 0xf)
+		return 0;		/* must be 16-byte aligned */
+	if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD))
+		return 0;
+	if (sp >= prev_sp + STACK_FRAME_OVERHEAD)
+		return 1;
+	/*
+	 * sp could decrease when we jump off an interrupt stack
+	 * back to the regular process stack.
+	 */
+	if ((sp & ~(THREAD_SIZE - 1)) != (prev_sp & ~(THREAD_SIZE - 1)))
+		return 1;
+	return 0;
+}
+
+static void perf_callchain_kernel(struct pt_regs *regs,
+				  struct perf_callchain_entry *entry)
+{
+	unsigned long sp, next_sp;
+	unsigned long next_ip;
+	unsigned long lr;
+	long level = 0;
+	unsigned long *fp;
+
+	lr = regs->link;
+	sp = regs->gpr[1];
+	callchain_store(entry, PERF_CONTEXT_KERNEL);
+	callchain_store(entry, regs->nip);
+
+	if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD))
+		return;
+
+	for (;;) {
+		fp = (unsigned long *) sp;
+		next_sp = fp[0];
+
+		if (next_sp == sp + STACK_INT_FRAME_SIZE &&
+		    fp[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
+			/*
+			 * This looks like an interrupt frame for an
+			 * interrupt that occurred in the kernel
+			 */
+			regs = (struct pt_regs *)(sp + STACK_FRAME_OVERHEAD);
+			next_ip = regs->nip;
+			lr = regs->link;
+			level = 0;
+			callchain_store(entry, PERF_CONTEXT_KERNEL);
+
+		} else {
+			if (level == 0)
+				next_ip = lr;
+			else
+				next_ip = fp[STACK_FRAME_LR_SAVE];
+
+			/*
+			 * We can't tell which of the first two addresses
+			 * we get are valid, but we can filter out the
+			 * obviously bogus ones here.  We replace them
+			 * with 0 rather than removing them entirely so
+			 * that userspace can tell which is which.
+			 */
+			if ((level == 1 && next_ip == lr) ||
+			    (level <= 1 && !kernel_text_address(next_ip)))
+				next_ip = 0;
+
+			++level;
+		}
+
+		callchain_store(entry, next_ip);
+		if (!valid_next_sp(next_sp, sp))
+			return;
+		sp = next_sp;
+	}
+}
+
+#ifdef CONFIG_PPC64
+
+#ifdef CONFIG_HUGETLB_PAGE
+#define is_huge_psize(pagesize)	(HPAGE_SHIFT && mmu_huge_psizes[pagesize])
+#else
+#define is_huge_psize(pagesize)	0
+#endif
+
+/*
+ * On 64-bit we don't want to invoke hash_page on user addresses from
+ * interrupt context, so if the access faults, we read the page tables
+ * to find which page (if any) is mapped and access it directly.
+ */
+static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
+{
+	pgd_t *pgdir;
+	pte_t *ptep, pte;
+	int pagesize;
+	unsigned long addr = (unsigned long) ptr;
+	unsigned long offset;
+	unsigned long pfn;
+	void *kaddr;
+
+	pgdir = current->mm->pgd;
+	if (!pgdir)
+		return -EFAULT;
+
+	pagesize = get_slice_psize(current->mm, addr);
+
+	/* align address to page boundary */
+	offset = addr & ((1ul << mmu_psize_defs[pagesize].shift) - 1);
+	addr -= offset;
+
+	if (is_huge_psize(pagesize))
+		ptep = huge_pte_offset(current->mm, addr);
+	else
+		ptep = find_linux_pte(pgdir, addr);
+
+	if (ptep == NULL)
+		return -EFAULT;
+	pte = *ptep;
+	if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER))
+		return -EFAULT;
+	pfn = pte_pfn(pte);
+	if (!page_is_ram(pfn))
+		return -EFAULT;
+
+	/* no highmem to worry about here */
+	kaddr = pfn_to_kaddr(pfn);
+	memcpy(ret, kaddr + offset, nb);
+	return 0;
+}
+
+static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)
+{
+	if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned long) ||
+	    ((unsigned long)ptr & 7))
+		return -EFAULT;
+
+	if (!__get_user_inatomic(*ret, ptr))
+		return 0;
+
+	return read_user_stack_slow(ptr, ret, 8);
+}
+
+static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
+{
+	if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
+	    ((unsigned long)ptr & 3))
+		return -EFAULT;
+
+	if (!__get_user_inatomic(*ret, ptr))
+		return 0;
+
+	return read_user_stack_slow(ptr, ret, 4);
+}
+
+static inline int valid_user_sp(unsigned long sp, int is_64)
+{
+	if (!sp || (sp & 7) || sp > (is_64 ? TASK_SIZE : 0x100000000UL) - 32)
+		return 0;
+	return 1;
+}
+
+/*
+ * 64-bit user processes use the same stack frame for RT and non-RT signals.
+ */
+struct signal_frame_64 {
+	char		dummy[__SIGNAL_FRAMESIZE];
+	struct ucontext	uc;
+	unsigned long	unused[2];
+	unsigned int	tramp[6];
+	struct siginfo	*pinfo;
+	void		*puc;
+	struct siginfo	info;
+	char		abigap[288];
+};
+
+static int is_sigreturn_64_address(unsigned long nip, unsigned long fp)
+{
+	if (nip == fp + offsetof(struct signal_frame_64, tramp))
+		return 1;
+	if (vdso64_rt_sigtramp && current->mm->context.vdso_base &&
+	    nip == current->mm->context.vdso_base + vdso64_rt_sigtramp)
+		return 1;
+	return 0;
+}
+
+/*
+ * Do some sanity checking on the signal frame pointed to by sp.
+ * We check the pinfo and puc pointers in the frame.
+ */
+static int sane_signal_64_frame(unsigned long sp)
+{
+	struct signal_frame_64 __user *sf;
+	unsigned long pinfo, puc;
+
+	sf = (struct signal_frame_64 __user *) sp;
+	if (read_user_stack_64((unsigned long __user *) &sf->pinfo, &pinfo) ||
+	    read_user_stack_64((unsigned long __user *) &sf->puc, &puc))
+		return 0;
+	return pinfo == (unsigned long) &sf->info &&
+		puc == (unsigned long) &sf->uc;
+}
+
+static void perf_callchain_user_64(struct pt_regs *regs,
+				   struct perf_callchain_entry *entry)
+{
+	unsigned long sp, next_sp;
+	unsigned long next_ip;
+	unsigned long lr;
+	long level = 0;
+	struct signal_frame_64 __user *sigframe;
+	unsigned long __user *fp, *uregs;
+
+	next_ip = regs->nip;
+	lr = regs->link;
+	sp = regs->gpr[1];
+	callchain_store(entry, PERF_CONTEXT_USER);
+	callchain_store(entry, next_ip);
+	/* Cap the walk: a cyclic user stack would otherwise spin forever */
+	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+		fp = (unsigned long __user *) sp;
+		if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
+			return;
+		if (level > 0 && read_user_stack_64(&fp[2], &next_ip))
+			return;
+
+		/*
+		 * Note: the next_sp - sp >= signal frame size check
+		 * is true when next_sp < sp, which can happen when
+		 * transitioning from an alternate signal stack to the
+		 * normal stack.
+		 */
+		if (next_sp - sp >= sizeof(struct signal_frame_64) &&
+		    (is_sigreturn_64_address(next_ip, sp) ||
+		     (level <= 1 && is_sigreturn_64_address(lr, sp))) &&
+		    sane_signal_64_frame(sp)) {
+			/*
+			 * This looks like a signal frame
+			 */
+			sigframe = (struct signal_frame_64 __user *) sp;
+			uregs = sigframe->uc.uc_mcontext.gp_regs;
+			if (read_user_stack_64(&uregs[PT_NIP], &next_ip) ||
+			    read_user_stack_64(&uregs[PT_LNK], &lr) ||
+			    read_user_stack_64(&uregs[PT_R1], &sp))
+				return;
+			level = 0;
+			callchain_store(entry, PERF_CONTEXT_USER);
+			callchain_store(entry, next_ip);
+			continue;
+		}
+
+		if (level == 0)
+			next_ip = lr;
+		callchain_store(entry, next_ip);
+		++level;
+		sp = next_sp;
+	}
+}
+
+static inline int current_is_64bit(void)
+{
+	/*
+	 * We can't use test_thread_flag() here because we may be on an
+	 * interrupt stack, and the thread flags don't get copied over
+	 * from the thread_info on the main stack to the interrupt stack.
+	 */
+	return !test_ti_thread_flag(task_thread_info(current), TIF_32BIT);
+}
+
+#else  /* CONFIG_PPC64 */
+/*
+ * On 32-bit we just access the address and let hash_page create a
+ * HPTE if necessary, so there is no need to fall back to reading
+ * the page tables.  Since this is called at interrupt level,
+ * do_page_fault() won't treat a DSI as a page fault.
+ */
+static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
+{
+	if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
+	    ((unsigned long)ptr & 3))
+		return -EFAULT;
+
+	return __get_user_inatomic(*ret, ptr);
+}
+
+static inline void perf_callchain_user_64(struct pt_regs *regs,
+					  struct perf_callchain_entry *entry)
+{
+}
+
+static inline int current_is_64bit(void)
+{
+	return 0;
+}
+
+static inline int valid_user_sp(unsigned long sp, int is_64)
+{
+	if (!sp || (sp & 7) || sp > TASK_SIZE - 32)
+		return 0;
+	return 1;
+}
+
+#define __SIGNAL_FRAMESIZE32	__SIGNAL_FRAMESIZE
+#define sigcontext32		sigcontext
+#define mcontext32		mcontext
+#define ucontext32		ucontext
+#define compat_siginfo_t	struct siginfo
+
+#endif /* CONFIG_PPC64 */
+
+/*
+ * Layout for non-RT signal frames
+ */
+struct signal_frame_32 {
+	char			dummy[__SIGNAL_FRAMESIZE32];
+	struct sigcontext32	sctx;
+	struct mcontext32	mctx;
+	int			abigap[56];
+};
+
+/*
+ * Layout for RT signal frames
+ */
+struct rt_signal_frame_32 {
+	char			dummy[__SIGNAL_FRAMESIZE32 + 16];
+	compat_siginfo_t	info;
+	struct ucontext32	uc;
+	int			abigap[56];
+};
+
+static int is_sigreturn_32_address(unsigned int nip, unsigned int fp)
+{
+	if (nip == fp + offsetof(struct signal_frame_32, mctx.mc_pad))
+		return 1;
+	if (vdso32_sigtramp && current->mm->context.vdso_base &&
+	    nip == current->mm->context.vdso_base + vdso32_sigtramp)
+		return 1;
+	return 0;
+}
+
+static int is_rt_sigreturn_32_address(unsigned int nip, unsigned int fp)
+{
+	if (nip == fp + offsetof(struct rt_signal_frame_32,
+				 uc.uc_mcontext.mc_pad))
+		return 1;
+	if (vdso32_rt_sigtramp && current->mm->context.vdso_base &&
+	    nip == current->mm->context.vdso_base + vdso32_rt_sigtramp)
+		return 1;
+	return 0;
+}
+
+static int sane_signal_32_frame(unsigned int sp)
+{
+	struct signal_frame_32 __user *sf;
+	unsigned int regs;
+
+	sf = (struct signal_frame_32 __user *) (unsigned long) sp;
+	if (read_user_stack_32((unsigned int __user *) &sf->sctx.regs, &regs))
+		return 0;
+	return regs == (unsigned long) &sf->mctx;
+}
+
+static int sane_rt_signal_32_frame(unsigned int sp)
+{
+	struct rt_signal_frame_32 __user *sf;
+	unsigned int regs;
+
+	sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
+	if (read_user_stack_32((unsigned int __user *) &sf->uc.uc_regs, &regs))
+		return 0;
+	return regs == (unsigned long) &sf->uc.uc_mcontext;
+}
+
+static unsigned int __user *signal_frame_32_regs(unsigned int sp,
+				unsigned int next_sp, unsigned int next_ip)
+{
+	struct mcontext32 __user *mctx = NULL;
+	struct signal_frame_32 __user *sf;
+	struct rt_signal_frame_32 __user *rt_sf;
+
+	/*
+	 * Note: the next_sp - sp >= signal frame size check
+	 * is true when next_sp < sp, for example, when
+	 * transitioning from an alternate signal stack to the
+	 * normal stack.
+	 */
+	if (next_sp - sp >= sizeof(struct signal_frame_32) &&
+	    is_sigreturn_32_address(next_ip, sp) &&
+	    sane_signal_32_frame(sp)) {
+		sf = (struct signal_frame_32 __user *) (unsigned long) sp;
+		mctx = &sf->mctx;
+	}
+
+	if (!mctx && next_sp - sp >= sizeof(struct rt_signal_frame_32) &&
+	    is_rt_sigreturn_32_address(next_ip, sp) &&
+	    sane_rt_signal_32_frame(sp)) {
+		rt_sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
+		mctx = &rt_sf->uc.uc_mcontext;
+	}
+
+	if (!mctx)
+		return NULL;
+	return mctx->mc_gregs;
+}
+
+static void perf_callchain_user_32(struct pt_regs *regs,
+				   struct perf_callchain_entry *entry)
+{
+	unsigned int sp, next_sp;
+	unsigned int next_ip;
+	unsigned int lr;
+	long level = 0;
+	unsigned int __user *fp, *uregs;
+
+	next_ip = regs->nip;
+	lr = regs->link;
+	sp = regs->gpr[1];
+	callchain_store(entry, PERF_CONTEXT_USER);
+	callchain_store(entry, next_ip);
+
+	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+		fp = (unsigned int __user *) (unsigned long) sp;
+		if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
+			return;
+		if (level > 0 && read_user_stack_32(&fp[1], &next_ip))
+			return;
+
+		uregs = signal_frame_32_regs(sp, next_sp, next_ip);
+		if (!uregs && level <= 1)
+			uregs = signal_frame_32_regs(sp, next_sp, lr);
+		if (uregs) {
+			/*
+			 * This looks like a signal frame, so restart
+			 * the stack trace with the values in it.
+			 */
+			if (read_user_stack_32(&uregs[PT_NIP], &next_ip) ||
+			    read_user_stack_32(&uregs[PT_LNK], &lr) ||
+			    read_user_stack_32(&uregs[PT_R1], &sp))
+				return;
+			level = 0;
+			callchain_store(entry, PERF_CONTEXT_USER);
+			callchain_store(entry, next_ip);
+			continue;
+		}
+
+		if (level == 0)
+			next_ip = lr;
+		callchain_store(entry, next_ip);
+		++level;
+		sp = next_sp;
+	}
+}
+
+/*
+ * Since we can't get PMU interrupts inside a PMU interrupt handler,
+ * we don't need separate irq and nmi entries here.
+ */
+static DEFINE_PER_CPU(struct perf_callchain_entry, callchain);
+
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+	struct perf_callchain_entry *entry = &__get_cpu_var(callchain);
+
+	entry->nr = 0;
+
+	if (current->pid == 0)		/* idle task? */
+		return entry;
+
+	if (!user_mode(regs)) {
+		perf_callchain_kernel(regs, entry);
+		if (current->mm)
+			regs = task_pt_regs(current);
+		else
+			regs = NULL;
+	}
+
+	if (regs) {
+		if (current_is_64bit())
+			perf_callchain_user_64(regs, entry);
+		else
+			perf_callchain_user_32(regs, entry);
+	}
+
+	return entry;
+}
-- 
cgit v1.2.3


From 747b50aaf728987732e6ff3ba10aba4acc4e0277 Mon Sep 17 00:00:00 2001
From: "markus.t.metzger@intel.com" <markus.t.metzger@intel.com>
Date: Wed, 2 Sep 2009 16:04:46 +0200
Subject: x86, perf_counter, bts: Fail if BTS is not available

Reserve PERF_COUNT_HW_BRANCH_INSTRUCTIONS with sample_period ==
1 for BTS tracing and fail if BTS is not available.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090902140612.943801000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 396e35db705..2f41874ffb8 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -769,7 +769,7 @@ static int reserve_bts_hardware(void)
 	int cpu, err = 0;
 
 	if (!bts_available())
-		return -EOPNOTSUPP;
+		return 0;
 
 	get_online_cpus();
 
@@ -914,7 +914,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 			if (!reserve_pmc_hardware())
 				err = -EBUSY;
 			else
-				reserve_bts_hardware();
+				err = reserve_bts_hardware();
 		}
 		if (!err)
 			atomic_inc(&active_counters);
@@ -979,6 +979,13 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (config == -1LL)
 		return -EINVAL;
 
+	/*
+	 * Branch tracing:
+	 */
+	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
+	    (hwc->sample_period == 1) && !bts_available())
+		return -EOPNOTSUPP;
+
 	hwc->config |= config;
 
 	return 0;
@@ -1355,19 +1362,9 @@ static int x86_pmu_enable(struct perf_counter *counter)
 
 	idx = fixed_mode_idx(counter, hwc);
 	if (idx == X86_PMC_IDX_FIXED_BTS) {
-		/*
-		 * Try to use BTS for branch tracing. If that is not
-		 * available, try to get a generic counter.
-		 */
-		if (unlikely(!cpuc->ds))
-			goto try_generic;
-
-		/*
-		 * Try to get the fixed counter, if that is already taken
-		 * then try to get a generic counter:
-		 */
+		/* BTS is already occupied. */
 		if (test_and_set_bit(idx, cpuc->used_mask))
-			goto try_generic;
+			return -EAGAIN;
 
 		hwc->config_base	= 0;
 		hwc->counter_base	= 0;
-- 
cgit v1.2.3


From 596da17f94c103348ebe04129c00d536ea0e80e2 Mon Sep 17 00:00:00 2001
From: "markus.t.metzger@intel.com" <markus.t.metzger@intel.com>
Date: Wed, 2 Sep 2009 16:04:47 +0200
Subject: x86, perf_counter, bts: Correct pointer-to-u64 casts

On 32bit, pointers in the DS AREA configuration are cast to
u64. The current (long) cast, used to avoid compiler warnings,
sign-extends the address to 64 bits.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090902140615.305889000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 2f41874ffb8..3776b0b630c 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -726,7 +726,8 @@ static inline void init_debug_store_on_cpu(int cpu)
 		return;
 
 	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
-		     (u32)((u64)(long)ds), (u32)((u64)(long)ds >> 32));
+		     (u32)((u64)(unsigned long)ds),
+		     (u32)((u64)(unsigned long)ds >> 32));
 }
 
 static inline void fini_debug_store_on_cpu(int cpu)
@@ -757,7 +758,7 @@ static void release_bts_hardware(void)
 
 		per_cpu(cpu_hw_counters, cpu).ds = NULL;
 
-		kfree((void *)(long)ds->bts_buffer_base);
+		kfree((void *)(unsigned long)ds->bts_buffer_base);
 		kfree(ds);
 	}
 
@@ -788,7 +789,7 @@ static int reserve_bts_hardware(void)
 			break;
 		}
 
-		ds->bts_buffer_base = (u64)(long)buffer;
+		ds->bts_buffer_base = (u64)(unsigned long)buffer;
 		ds->bts_index = ds->bts_buffer_base;
 		ds->bts_absolute_maximum =
 			ds->bts_buffer_base + BTS_BUFFER_SIZE;
@@ -1491,7 +1492,7 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
 	};
 	struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS];
 	unsigned long orig_ip = data->regs->ip;
-	u64 at;
+	struct bts_record *at, *top;
 
 	if (!counter)
 		return;
@@ -1499,19 +1500,18 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
 	if (!ds)
 		return;
 
-	for (at = ds->bts_buffer_base;
-	     at < ds->bts_index;
-	     at += sizeof(struct bts_record)) {
-		struct bts_record *rec = (struct bts_record *)(long)at;
+	at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+	top = (struct bts_record *)(unsigned long)ds->bts_index;
 
-		data->regs->ip	= rec->from;
-		data->addr	= rec->to;
+	ds->bts_index = ds->bts_buffer_base;
+
+	for (; at < top; at++) {
+		data->regs->ip	= at->from;
+		data->addr	= at->to;
 
 		perf_counter_output(counter, 1, data);
 	}
 
-	ds->bts_index = ds->bts_buffer_base;
-
 	data->regs->ip	= orig_ip;
 	data->addr	= 0;
 
-- 
cgit v1.2.3


From 1653192f510bd8114b7b133d7289e6e5c3e95046 Mon Sep 17 00:00:00 2001
From: "markus.t.metzger@intel.com" <markus.t.metzger@intel.com>
Date: Wed, 2 Sep 2009 16:04:48 +0200
Subject: x86, perf_counter, bts: Do not allow kernel BTS tracing for now

Kernel BTS tracing generates too much data too fast for us to
handle, causing the kernel to hang.

Fail for BTS requests for kernel code.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090902140616.901253000@intel.com>
[ This is really a workaround - but we want BTS tracing in .32
  so make sure we don't regress. The lockup should be fixed
  ASAP. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3776b0b630c..f9cd0849bd4 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -984,8 +984,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * Branch tracing:
 	 */
 	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
-	    (hwc->sample_period == 1) && !bts_available())
-		return -EOPNOTSUPP;
+	    (hwc->sample_period == 1)) {
+		/* BTS is not supported by this architecture. */
+		if (!bts_available())
+			return -EOPNOTSUPP;
+
+		/* BTS is currently only allowed for user-mode. */
+		if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+			return -EOPNOTSUPP;
+	}
 
 	hwc->config |= config;
 
-- 
cgit v1.2.3