From 241771ef016b5c0c83cd7a4372a74321c973c1e6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Dec 2008 10:39:53 +0100 Subject: performance counters: x86 support Implement performance counters for x86 Intel CPUs. It's simplified right now: the PERFMON CPU feature is assumed, which is available in Core2 and later Intel CPUs. The design is flexible to be extended to more CPU types as well. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/ia32/ia32entry.S | 3 +- arch/x86/include/asm/hardirq_32.h | 1 + arch/x86/include/asm/hw_irq.h | 2 + arch/x86/include/asm/intel_arch_perfmon.h | 34 +- arch/x86/include/asm/irq_vectors.h | 5 + arch/x86/include/asm/mach-default/entry_arch.h | 5 + arch/x86/include/asm/pda.h | 1 + arch/x86/include/asm/thread_info.h | 4 +- arch/x86/include/asm/unistd_32.h | 1 + arch/x86/include/asm/unistd_64.h | 3 +- arch/x86/kernel/apic.c | 2 + arch/x86/kernel/cpu/Makefile | 12 +- arch/x86/kernel/cpu/common.c | 2 + arch/x86/kernel/cpu/perf_counter.c | 571 +++++++++++++++++++++++++ arch/x86/kernel/entry_64.S | 5 + arch/x86/kernel/irq.c | 5 + arch/x86/kernel/irqinit_32.c | 3 + arch/x86/kernel/irqinit_64.c | 5 + arch/x86/kernel/signal.c | 7 +- arch/x86/kernel/syscall_table_32.S | 1 + 21 files changed, 652 insertions(+), 21 deletions(-) create mode 100644 arch/x86/kernel/cpu/perf_counter.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d4d4cb7629e..f2fdc186724 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -643,6 +643,7 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH)) + select HAVE_PERF_COUNTERS config X86_IO_APIC def_bool y diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 256b00b6189..3c14ed07dc4 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -823,7 +823,8 @@ ia32_sys_call_table: .quad compat_sys_signalfd4 .quad sys_eventfd2 .quad sys_epoll_create1 - .quad sys_dup3 /* 330 */ + .quad sys_dup3 /* 330 */ .quad sys_pipe2 .quad sys_inotify_init1 + .quad sys_perf_counter_open ia32_syscall_end: diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h index 5ca135e72f2..b3e475dc933 100644 --- a/arch/x86/include/asm/hardirq_32.h +++ b/arch/x86/include/asm/hardirq_32.h @@ -9,6 +9,7 @@ typedef struct { unsigned long idle_timestamp; unsigned int __nmi_count; /* arch dependent */ unsigned int apic_timer_irqs; /* arch dependent */ + unsigned int apic_perf_irqs; /* arch dependent */ unsigned int irq0_irqs; unsigned int irq_resched_count; unsigned int irq_call_count; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 8de644b6b95..aa93e53b85e 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -30,6 +30,8 @@ /* Interrupt handlers registered during init_IRQ */ extern void apic_timer_interrupt(void); extern void error_interrupt(void); +extern void perf_counter_interrupt(void); + extern void spurious_interrupt(void); extern void thermal_interrupt(void); extern void reschedule_interrupt(void); diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h index fa0fd068bc2..71598a9eab6 100644 --- a/arch/x86/include/asm/intel_arch_perfmon.h +++ b/arch/x86/include/asm/intel_arch_perfmon.h @@ -1,22 +1,24 @@ #ifndef _ASM_X86_INTEL_ARCH_PERFMON_H #define _ASM_X86_INTEL_ARCH_PERFMON_H -#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 -#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 +#define 
MSR_ARCH_PERFMON_PERFCTR0 0xc1 +#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 -#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 -#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 +#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 +#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 -#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) -#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) -#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) -#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) +#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) +#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) +#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ - (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) + (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) + +#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 union cpuid10_eax { struct { @@ -28,4 +30,12 @@ union cpuid10_eax { unsigned int full; }; +#ifdef CONFIG_PERF_COUNTERS +extern void init_hw_perf_counters(void); +extern void perf_counters_lapic_init(int nmi); +#else +static inline void init_hw_perf_counters(void) { } +static inline void perf_counters_lapic_init(int nmi) { } +#endif + #endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 0005adb0f94..b8d277f1252 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -86,6 +86,11 @@ */ #define LOCAL_TIMER_VECTOR 0xef +/* + * Performance monitoring interrupt vector: + */ +#define LOCAL_PERF_VECTOR 0xee + /* * First APIC vector available to drivers: (vectors 0x30-0xee) we * start at 0x31(0x41) to spread out vectors evenly between priority diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h index 6b1add8e31d..ad31e5d90e9 100644 --- a/arch/x86/include/asm/mach-default/entry_arch.h +++ b/arch/x86/include/asm/mach-default/entry_arch.h @@ -25,10 +25,15 @@ BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) * a much simpler SMP time architecture: */ #ifdef CONFIG_X86_LOCAL_APIC + BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) +#ifdef CONFIG_PERF_COUNTERS +BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) +#endif + #ifdef CONFIG_X86_MCE_P4THERMAL BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) #endif diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h index 2fbfff88df3..90a8d9d4206 100644 --- a/arch/x86/include/asm/pda.h +++ b/arch/x86/include/asm/pda.h @@ -30,6 +30,7 @@ struct x8664_pda { short isidle; struct mm_struct *active_mm; unsigned apic_timer_irqs; + unsigned apic_perf_irqs; unsigned irq0_irqs; unsigned irq_resched_count; unsigned irq_call_count; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e44d379faad..810bf266d13 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -80,6 +80,7 @@ struct thread_info { #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an 
MCE */ +#define TIF_PERF_COUNTERS 11 /* notify perf counter work */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ @@ -103,6 +104,7 @@ struct thread_info { #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) +#define _TIF_PERF_COUNTERS (1 << TIF_PERF_COUNTERS) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) @@ -135,7 +137,7 @@ struct thread_info { /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) + (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index f2bba78430a..7e47658b0a6 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -338,6 +338,7 @@ #define __NR_dup3 330 #define __NR_pipe2 331 #define __NR_inotify_init1 332 +#define __NR_perf_counter_open 333 #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index d2e415e6666..53025feaf88 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -653,7 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3) __SYSCALL(__NR_pipe2, sys_pipe2) #define __NR_inotify_init1 294 __SYSCALL(__NR_inotify_init1, sys_inotify_init1) - +#define __NR_perf_counter_open 295 +__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 16f94879b52..8ab8c185867 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -1147,6 +1148,7 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif + perf_counters_lapic_init(0); preempt_disable(); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 82ec6075c05..89e53361fe2 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -1,5 +1,5 @@ # -# Makefile for x86-compatible CPU details and quirks +# Makefile for x86-compatible CPU details, features and quirks # obj-y := intel_cacheinfo.o addon_cpuid_features.o @@ -16,11 +16,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_X86_MCE) += mcheck/ -obj-$(CONFIG_MTRR) += mtrr/ -obj-$(CONFIG_CPU_FREQ) += cpufreq/ +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o +obj-$(CONFIG_X86_MCE) += mcheck/ +obj-$(CONFIG_MTRR) += mtrr/ +obj-$(CONFIG_CPU_FREQ) += cpufreq/ + +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b9c9ea0217a..4461011db47 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -750,6 +751,7 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif + init_hw_perf_counters(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/perf_counter.c 
b/arch/x86/kernel/cpu/perf_counter.c new file mode 100644 index 00000000000..82440cbed0e --- /dev/null +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -0,0 +1,571 @@ +/* + * Performance counter x86 architecture code + * + * Copyright(C) 2008 Thomas Gleixner + * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static bool perf_counters_initialized __read_mostly; + +/* + * Number of (generic) HW counters: + */ +static int nr_hw_counters __read_mostly; +static u32 perf_counter_mask __read_mostly; + +/* No support for fixed function counters yet */ + +#define MAX_HW_COUNTERS 8 + +struct cpu_hw_counters { + struct perf_counter *counters[MAX_HW_COUNTERS]; + unsigned long used[BITS_TO_LONGS(MAX_HW_COUNTERS)]; + int enable_all; +}; + +/* + * Intel PerfMon v3. Used on Core2 and later. + */ +static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); + +const int intel_perfmon_event_map[] = +{ + [PERF_COUNT_CYCLES] = 0x003c, + [PERF_COUNT_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_CACHE_MISSES] = 0x412e, + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_BRANCH_MISSES] = 0x00c5, +}; + +const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); + +/* + * Setup the hardware configuration for a given hw_event_type + */ +int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) +{ + struct hw_perf_counter *hwc = &counter->hw; + + if (unlikely(!perf_counters_initialized)) + return -EINVAL; + + /* + * Count user events, and generate PMC IRQs: + * (keep 'enabled' bit clear for now) + */ + hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT; + + /* + * If privileged enough, count OS events too, and allow + * NMI events as well: + */ + hwc->nmi = 0; + if (capable(CAP_SYS_ADMIN)) { + hwc->config |= ARCH_PERFMON_EVENTSEL_OS; + if (hw_event_type & PERF_COUNT_NMI) + hwc->nmi = 1; + } + + hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; + hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; + + hwc->irq_period = counter->__irq_period; + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic counter period: + */ + if (!hwc->irq_period) + hwc->irq_period = 0x7FFFFFFF; + + hwc->next_count = -((s32) hwc->irq_period); + + /* + * Negative event types mean raw encoded event+umask values: + */ + if (hw_event_type < 0) { + counter->hw_event_type = -hw_event_type; + counter->hw_event_type &= ~PERF_COUNT_NMI; + } else { + hw_event_type &= ~PERF_COUNT_NMI; + if (hw_event_type >= max_intel_perfmon_events) + return -EINVAL; + /* + * The generic map: + */ + counter->hw_event_type = intel_perfmon_event_map[hw_event_type]; + } + hwc->config |= counter->hw_event_type; + counter->wakeup_pending = 0; + + return 0; +} + +static void __hw_perf_enable_all(void) +{ + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); +} + +void hw_perf_enable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + cpuc->enable_all = 1; + __hw_perf_enable_all(); +} + +void hw_perf_disable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + cpuc->enable_all = 0; + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); +} + +static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]); + +static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +{ + 
per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count; + + wrmsr(hwc->counter_base + idx, hwc->next_count, 0); + wrmsr(hwc->config_base + idx, hwc->config, 0); +} + +void hw_perf_counter_enable(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + int idx = hwc->idx; + + /* Try to get the previous counter again */ + if (test_and_set_bit(idx, cpuc->used)) { + idx = find_first_zero_bit(cpuc->used, nr_hw_counters); + set_bit(idx, cpuc->used); + hwc->idx = idx; + } + + perf_counters_lapic_init(hwc->nmi); + + wrmsr(hwc->config_base + idx, + hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + + cpuc->counters[idx] = counter; + counter->hw.config |= ARCH_PERFMON_EVENTSEL0_ENABLE; + __hw_perf_counter_enable(hwc, idx); +} + +#ifdef CONFIG_X86_64 +static inline void atomic64_counter_set(struct perf_counter *counter, u64 val) +{ + atomic64_set(&counter->count, val); +} + +static inline u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic64_read(&counter->count); +} +#else +/* + * Todo: add proper atomic64_t support to 32-bit x86: + */ +static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64) +{ + u32 *val32 = (void *)&val64; + + atomic_set(counter->count32 + 0, *(val32 + 0)); + atomic_set(counter->count32 + 1, *(val32 + 1)); +} + +static inline u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic_read(counter->count32 + 0) | + (u64) atomic_read(counter->count32 + 1) << 32; +} +#endif + +static void __hw_perf_save_counter(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) +{ + s64 raw = -1; + s64 delta; + int err; + + /* + * Get the raw hw counter value: + */ + err = rdmsrl_safe(hwc->counter_base + idx, &raw); + WARN_ON_ONCE(err); + + /* + * Rebase it to zero (it started counting at -irq_period), + * to see the delta since ->prev_count: + */ + delta = (s64)hwc->irq_period + (s64)(s32)raw; + + atomic64_counter_set(counter, hwc->prev_count + delta); + + /* + * Adjust the ->prev_count offset - if we went beyond + * irq_period of units, then we got an IRQ and the counter + * was set back to -irq_period: + */ + while (delta >= (s64)hwc->irq_period) { + hwc->prev_count += hwc->irq_period; + delta -= (s64)hwc->irq_period; + } + + /* + * Calculate the next raw counter value we'll write into + * the counter at the next sched-in time: + */ + delta -= (s64)hwc->irq_period; + + hwc->next_count = (s32)delta; +} + +void perf_counter_print_debug(void) +{ + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count; + int cpu, err, idx; + + local_irq_disable(); + + cpu = smp_processor_id(); + + err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &ctrl); + WARN_ON_ONCE(err); + + err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_STATUS, &status); + WARN_ON_ONCE(err); + + err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_OVF_CTRL, &overflow); + WARN_ON_ONCE(err); + + printk(KERN_INFO "\n"); + printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); + printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); + printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); + + for (idx = 0; idx < nr_hw_counters; idx++) { + err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); + WARN_ON_ONCE(err); + + err = rdmsrl_safe(MSR_ARCH_PERFMON_PERFCTR0 + idx, &pmc_count); + WARN_ON_ONCE(err); + + next_count = per_cpu(prev_next_count[idx], cpu); + + printk(KERN_INFO "CPU#%d: PMC%d ctrl: %016llx\n", + cpu, idx, pmc_ctrl); + printk(KERN_INFO "CPU#%d: PMC%d 
count: %016llx\n", + cpu, idx, pmc_count); + printk(KERN_INFO "CPU#%d: PMC%d next: %016llx\n", + cpu, idx, next_count); + } + local_irq_enable(); +} + +void hw_perf_counter_disable(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + unsigned int idx = hwc->idx; + + counter->hw.config &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(hwc->config_base + idx, hwc->config, 0); + + clear_bit(idx, cpuc->used); + cpuc->counters[idx] = NULL; + __hw_perf_save_counter(counter, hwc, idx); +} + +void hw_perf_counter_read(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + unsigned long addr = hwc->counter_base + hwc->idx; + s64 offs, val = -1LL; + s32 val32; + int err; + + /* Careful: NMI might modify the counter offset */ + do { + offs = hwc->prev_count; + err = rdmsrl_safe(addr, &val); + WARN_ON_ONCE(err); + } while (offs != hwc->prev_count); + + val32 = (s32) val; + val = (s64)hwc->irq_period + (s64)val32; + atomic64_counter_set(counter, hwc->prev_count + val); +} + +static void perf_store_irq_data(struct perf_counter *counter, u64 data) +{ + struct perf_data *irqdata = counter->irqdata; + + if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { + irqdata->overrun++; + } else { + u64 *p = (u64 *) &irqdata->data[irqdata->len]; + + *p = data; + irqdata->len += sizeof(u64); + } +} + +static void perf_save_and_restart(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + int idx = hwc->idx; + + wrmsr(hwc->config_base + idx, + hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + + if (hwc->config & ARCH_PERFMON_EVENTSEL0_ENABLE) { + __hw_perf_save_counter(counter, hwc, idx); + __hw_perf_counter_enable(hwc, idx); + } +} + +static void +perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) +{ + struct perf_counter_context *ctx = leader->ctx; + struct perf_counter *counter; + int bit; + + list_for_each_entry(counter, &ctx->counters, list) { + if (counter->record_type != PERF_RECORD_SIMPLE || + counter == leader) + continue; + + if (counter->active) { + /* + * When counter was not in the overflow mask, we have to + * read it from hardware. We read it as well, when it + * has not been read yet and clear the bit in the + * status mask. 
+ */ + bit = counter->hw.idx; + if (!test_bit(bit, (unsigned long *) overflown) || + test_bit(bit, (unsigned long *) status)) { + clear_bit(bit, (unsigned long *) status); + perf_save_and_restart(counter); + } + } + perf_store_irq_data(leader, counter->hw_event_type); + perf_store_irq_data(leader, atomic64_counter_read(counter)); + } +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) +{ + int bit, cpu = smp_processor_id(); + struct cpu_hw_counters *cpuc; + u64 ack, status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + if (!status) { + ack_APIC_irq(); + return; + } + + /* Disable counters globally */ + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + ack_APIC_irq(); + + cpuc = &per_cpu(cpu_hw_counters, cpu); + +again: + ack = status; + for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { + struct perf_counter *counter = cpuc->counters[bit]; + + clear_bit(bit, (unsigned long *) &status); + if (!counter) + continue; + + perf_save_and_restart(counter); + + switch (counter->record_type) { + case PERF_RECORD_SIMPLE: + continue; + case PERF_RECORD_IRQ: + perf_store_irq_data(counter, instruction_pointer(regs)); + break; + case PERF_RECORD_GROUP: + perf_store_irq_data(counter, counter->hw_event_type); + perf_store_irq_data(counter, + atomic64_counter_read(counter)); + perf_handle_group(counter, &status, &ack); + break; + } + /* + * From NMI context we cannot call into the scheduler to + * do a task wakeup - but we mark these counters as + * wakeup_pending and initate a wakeup callback: + */ + if (nmi) { + counter->wakeup_pending = 1; + set_tsk_thread_flag(current, TIF_PERF_COUNTERS); + } else { + wake_up(&counter->waitq); + } + } + + wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0); + + /* + * Repeat if there is more work to be done: + */ + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + if (status) + goto again; + + /* + * Do not reenable when global enable is off: + */ + if (cpuc->enable_all) + __hw_perf_enable_all(); +} + +void smp_perf_counter_interrupt(struct pt_regs *regs) +{ + irq_enter(); +#ifdef CONFIG_X86_64 + add_pda(apic_perf_irqs, 1); +#else + per_cpu(irq_stat, smp_processor_id()).apic_perf_irqs++; +#endif + apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); + __smp_perf_counter_interrupt(regs, 0); + + irq_exit(); +} + +/* + * This handler is triggered by NMI contexts: + */ +void perf_counter_notify(struct pt_regs *regs) +{ + struct cpu_hw_counters *cpuc; + unsigned long flags; + int bit, cpu; + + local_irq_save(flags); + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); + + for_each_bit(bit, cpuc->used, nr_hw_counters) { + struct perf_counter *counter = cpuc->counters[bit]; + + if (!counter) + continue; + + if (counter->wakeup_pending) { + counter->wakeup_pending = 0; + wake_up(&counter->waitq); + } + } + + local_irq_restore(flags); +} + +void __cpuinit perf_counters_lapic_init(int nmi) +{ + u32 apic_val; + + if (!perf_counters_initialized) + return; + /* + * Enable the performance counter vector in the APIC LVT: + */ + apic_val = apic_read(APIC_LVTERR); + + apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED); + if (nmi) + apic_write(APIC_LVTPC, APIC_DM_NMI); + else + apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); + apic_write(APIC_LVTERR, apic_val); +} + +static int __kprobes +perf_counter_nmi_handler(struct notifier_block *self, + unsigned long cmd, void *__args) +{ + struct die_args *args = __args; + struct pt_regs *regs; + + if (likely(cmd != 
DIE_NMI_IPI)) + return NOTIFY_DONE; + + regs = args->regs; + + apic_write(APIC_LVTPC, APIC_DM_NMI); + __smp_perf_counter_interrupt(regs, 1); + + return NOTIFY_STOP; +} + +static __read_mostly struct notifier_block perf_counter_nmi_notifier = { + .notifier_call = perf_counter_nmi_handler +}; + +void __init init_hw_perf_counters(void) +{ + union cpuid10_eax eax; + unsigned int unused; + unsigned int ebx; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return; + + /* + * Check whether the Architectural PerfMon supports + * Branch Misses Retired Event or not. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + return; + + printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); + + printk(KERN_INFO "... version: %d\n", eax.split.version_id); + printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters); + nr_hw_counters = eax.split.num_counters; + if (nr_hw_counters > MAX_HW_COUNTERS) { + nr_hw_counters = MAX_HW_COUNTERS; + WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", + nr_hw_counters, MAX_HW_COUNTERS); + } + perf_counter_mask = (1 << nr_hw_counters) - 1; + perf_max_counters = nr_hw_counters; + + printk(KERN_INFO "... bit_width: %d\n", eax.split.bit_width); + printk(KERN_INFO "... mask_length: %d\n", eax.split.mask_length); + + perf_counters_lapic_init(0); + register_die_notifier(&perf_counter_nmi_notifier); + + perf_counters_initialized = true; +} diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3194636a429..fc013cfde30 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -984,6 +984,11 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt +#ifdef CONFIG_PERF_COUNTERS +apicinterrupt LOCAL_PERF_VECTOR \ + perf_counter_interrupt smp_perf_counter_interrupt +#endif + /* * Exception entry points. 
*/ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d1d4dc52f64..d92bc71e41a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -56,6 +56,10 @@ static int show_other_interrupts(struct seq_file *p) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); seq_printf(p, " Local timer interrupts\n"); + seq_printf(p, "CNT: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); + seq_printf(p, " Performance counter interrupts\n"); #endif #ifdef CONFIG_SMP seq_printf(p, "RES: "); @@ -160,6 +164,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; + sum += irq_stats(cpu)->apic_perf_irqs; #endif #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 607db63044a..6a33b5e3016 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -160,6 +160,9 @@ void __init native_init_IRQ(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +# ifdef CONFIG_PERF_COUNTERS + alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); +# endif #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 8670b3ce626..91d785c25ad 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -138,6 +138,11 @@ static void __init apic_intr_init(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + /* Performance monitoring interrupt: */ +#ifdef CONFIG_PERF_COUNTERS + alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); +#endif } void __init native_init_IRQ(void) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b1cc6da6420..dee553c503d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,7 +6,7 @@ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ - +#include #include #include #include @@ -891,6 +891,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) tracehook_notify_resume(regs); } + if (thread_info_flags & _TIF_PERF_COUNTERS) { + clear_thread_flag(TIF_PERF_COUNTERS); + perf_counter_notify(regs); + } + #ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d44395ff34c..496726ddcea 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -332,3 +332,4 @@ ENTRY(sys_call_table) .long sys_dup3 /* 330 */ .long sys_pipe2 .long sys_inotify_init1 + .long sys_perf_counter_open -- cgit v1.2.3 From 87b9cf4623ad4e5fc009e48c020593dffd5d3793 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Dec 2008 14:20:16 +0100 Subject: x86, perfcounters: read out MSR_CORE_PERF_GLOBAL_STATUS with counters disabled Impact: make perfcounter NMI and IRQ sequence more robust Make __smp_perf_counter_interrupt() a bit more conservative: first disable all counters, then read out the status. Most invocations are because there are real events, so there's no performance impact. Code flow gets a bit simpler as well this way. 
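For illustration only, a condensed sketch of the resulting entry sequence (it mirrors the hunk below and is not additional kernel code; the wrapper function name is made up):

static void perf_counter_irq_entry_sketch(void)
{
	u64 status;

	/* Disable all counters and ack the APIC before looking at anything: */
	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
	ack_APIC_irq();

	/* Only now sample the overflow status - no new overflow can slip in: */
	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
	if (!status)
		return;

	/* ... handle the overflown counters, ack them via GLOBAL_OVF_CTRL ... */
}
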
Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 82440cbed0e..615e953208e 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -383,18 +383,16 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) struct cpu_hw_counters *cpuc; u64 ack, status; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - if (!status) { - ack_APIC_irq(); - return; - } - /* Disable counters globally */ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); ack_APIC_irq(); cpuc = &per_cpu(cpu_hw_counters, cpu); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + if (!status) + goto out; + again: ack = status; for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { @@ -440,7 +438,7 @@ again: rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); if (status) goto again; - +out: /* * Do not reenable when global enable is off: */ -- cgit v1.2.3 From 7e2ae34749edf19e76e594b9c4b2cdde1066afc5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 9 Dec 2008 11:40:46 +0100 Subject: perfcounters, x86: simplify disable/enable of counters Impact: fix spurious missed counter wakeups In the case of NMI events, close a race window that can occur if an NMI hits counter code that temporarily disables+enables a counter, and the NMI leaks into the disabled section. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 40 ++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 615e953208e..7d528ffc2d2 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -136,14 +136,25 @@ void hw_perf_disable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); } +static inline void +__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) +{ + wrmsr(hwc->config_base + idx, hwc->config, 0); +} + static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]); -static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx) { per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count; wrmsr(hwc->counter_base + idx, hwc->next_count, 0); - wrmsr(hwc->config_base + idx, hwc->config, 0); +} + +static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +{ + wrmsr(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } void hw_perf_counter_enable(struct perf_counter *counter) @@ -161,11 +172,11 @@ void hw_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - wrmsr(hwc->config_base + idx, - hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + __hw_perf_counter_disable(hwc, idx); cpuc->counters[idx] = counter; - counter->hw.config |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + __hw_perf_counter_set_period(hwc, idx); __hw_perf_counter_enable(hwc, idx); } @@ -286,8 +297,7 @@ void hw_perf_counter_disable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - counter->hw.config &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(hwc->config_base + idx, hwc->config, 0); + __hw_perf_counter_disable(hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; @@ -328,18 +338,24 @@ static void perf_store_irq_data(struct 
perf_counter *counter, u64 data) } } +/* + * NMI-safe enable method: + */ static void perf_save_and_restart(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; + u64 pmc_ctrl; + int err; - wrmsr(hwc->config_base + idx, - hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); + WARN_ON_ONCE(err); - if (hwc->config & ARCH_PERFMON_EVENTSEL0_ENABLE) { - __hw_perf_save_counter(counter, hwc, idx); + __hw_perf_save_counter(counter, hwc, idx); + __hw_perf_counter_set_period(hwc, idx); + + if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) __hw_perf_counter_enable(hwc, idx); - } } static void -- cgit v1.2.3 From 1e12567678054bc1d4c944ecfad17624b3e49345 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 9 Dec 2008 12:18:18 +0100 Subject: perfcounters, x86: clean up debug code Impact: cleanup Get rid of unused debug code. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7d528ffc2d2..919ec46679b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -214,13 +214,11 @@ static void __hw_perf_save_counter(struct perf_counter *counter, { s64 raw = -1; s64 delta; - int err; /* * Get the raw hw counter value: */ - err = rdmsrl_safe(hwc->counter_base + idx, &raw); - WARN_ON_ONCE(err); + rdmsrl(hwc->counter_base + idx, raw); /* * Rebase it to zero (it started counting at -irq_period), @@ -252,20 +250,18 @@ static void __hw_perf_save_counter(struct perf_counter *counter, void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count; - int cpu, err, idx; + int cpu, idx; + + if (!nr_hw_counters) + return; local_irq_disable(); cpu = smp_processor_id(); - err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &ctrl); - WARN_ON_ONCE(err); - - err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_STATUS, &status); - WARN_ON_ONCE(err); - - err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_OVF_CTRL, &overflow); - WARN_ON_ONCE(err); + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); printk(KERN_INFO "\n"); printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); @@ -273,11 +269,8 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); for (idx = 0; idx < nr_hw_counters; idx++) { - err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); - WARN_ON_ONCE(err); - - err = rdmsrl_safe(MSR_ARCH_PERFMON_PERFCTR0 + idx, &pmc_count); - WARN_ON_ONCE(err); + rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); + rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); next_count = per_cpu(prev_next_count[idx], cpu); @@ -310,13 +303,11 @@ void hw_perf_counter_read(struct perf_counter *counter) unsigned long addr = hwc->counter_base + hwc->idx; s64 offs, val = -1LL; s32 val32; - int err; /* Careful: NMI might modify the counter offset */ do { offs = hwc->prev_count; - err = rdmsrl_safe(addr, &val); - WARN_ON_ONCE(err); + rdmsrl(addr, val); } while (offs != hwc->prev_count); val32 = (s32) val; @@ -346,10 +337,8 @@ static void perf_save_and_restart(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; u64 pmc_ctrl; - int err; - err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); - 
WARN_ON_ONCE(err); + rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); __hw_perf_save_counter(counter, hwc, idx); __hw_perf_counter_set_period(hwc, idx); -- cgit v1.2.3 From 43874d238d5f208854a73c3225ca2a22833eec8b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 9 Dec 2008 12:23:59 +0100 Subject: perfcounters: consolidate global-disable codepaths Impact: cleanup Simplify global disable handling. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 919ec46679b..6a93d1f04d9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -33,7 +33,6 @@ static u32 perf_counter_mask __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[MAX_HW_COUNTERS]; unsigned long used[BITS_TO_LONGS(MAX_HW_COUNTERS)]; - int enable_all; }; /* @@ -115,24 +114,13 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) return 0; } -static void __hw_perf_enable_all(void) -{ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); -} - void hw_perf_enable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - cpuc->enable_all = 1; - __hw_perf_enable_all(); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } void hw_perf_disable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - cpuc->enable_all = 0; wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); } @@ -385,8 +373,10 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); + u64 ack, status, saved_global; struct cpu_hw_counters *cpuc; - u64 ack, status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); /* Disable counters globally */ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); @@ -445,10 +435,9 @@ again: goto again; out: /* - * Do not reenable when global enable is off: + * Restore - do not reenable when global enable is off: */ - if (cpuc->enable_all) - __hw_perf_enable_all(); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0); } void smp_perf_counter_interrupt(struct pt_regs *regs) -- cgit v1.2.3 From 4ac13294e44664bb7edf4daf52edb71e7c6bbe84 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Dec 2008 21:43:39 +0100 Subject: perf counters: protect them against CSTATE transitions Impact: fix rare lost events problem There are CPUs whose performance counters misbehave on CSTATE transitions, so provide a way to just disable/enable them around deep idle methods. (hw_perf_enable_all() is cheap on x86.) 
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6a93d1f04d9..0a7f3bea2dc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -119,10 +120,21 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_disable_all(void) +void hw_perf_restore_ctrl(u64 ctrl) { + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); +} +EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl); + +u64 hw_perf_disable_all(void) +{ + u64 ctrl; + + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + return ctrl; } +EXPORT_SYMBOL_GPL(hw_perf_disable_all); static inline void __hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) -- cgit v1.2.3 From dfa7c899b401d7dc5d85aca416aee64ac82812f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Dec 2008 19:35:37 +0100 Subject: perf counters: expand use of counter->event Impact: change syscall, cleanup Make use of the new perf_counters event type. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 0a7f3bea2dc..30e7ebf7827 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -56,9 +56,10 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* * Setup the hardware configuration for a given hw_event_type */ -int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) +int hw_perf_counter_init(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; + u32 hw_event_type = counter->event.hw_event_type; if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -83,7 +84,7 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = counter->__irq_period; + hwc->irq_period = counter->event.hw_event_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -95,21 +96,19 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) hwc->next_count = -((s32) hwc->irq_period); /* - * Negative event types mean raw encoded event+umask values: + * Raw event type provide the config in the event structure */ - if (hw_event_type < 0) { - counter->hw_event_type = -hw_event_type; - counter->hw_event_type &= ~PERF_COUNT_NMI; + hw_event_type &= ~PERF_COUNT_NMI; + if (hw_event_type == PERF_COUNT_RAW) { + hwc->config |= counter->event.hw_raw_ctrl; } else { - hw_event_type &= ~PERF_COUNT_NMI; if (hw_event_type >= max_intel_perfmon_events) return -EINVAL; /* * The generic map: */ - counter->hw_event_type = intel_perfmon_event_map[hw_event_type]; + hwc->config |= intel_perfmon_event_map[hw_event_type]; } - hwc->config |= counter->hw_event_type; counter->wakeup_pending = 0; return 0; @@ -373,7 +372,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, 
counter->hw_event_type); + perf_store_irq_data(leader, counter->event.hw_event_type); perf_store_irq_data(leader, atomic64_counter_read(counter)); } } @@ -418,7 +417,8 @@ again: perf_store_irq_data(counter, instruction_pointer(regs)); break; case PERF_RECORD_GROUP: - perf_store_irq_data(counter, counter->hw_event_type); + perf_store_irq_data(counter, + counter->event.hw_event_type); perf_store_irq_data(counter, atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); -- cgit v1.2.3 From 9f66a3810fe0d4100972db84290f3ae4a4d77025 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 10 Dec 2008 12:33:23 +0100 Subject: perf counters: restructure the API Impact: clean up new API Thorough cleanup of the new perf counters API, we now get clean separation of the various concepts: - introduce perf_counter_hw_event to separate out the event source details - move special type flags into separate attributes: PERF_COUNT_NMI, PERF_COUNT_RAW - extend the type to u64 and reserve it fully to the architecture in the raw type case. And make use of all these changes in the core and x86 perfcounters code. Also change the syscall signature to: asmlinkage int sys_perf_counter_open( struct perf_counter_hw_event *hw_event_uptr __user, pid_t pid, int cpu, int group_fd); ( Note that group_fd is unused for now - it's reserved for the counter groups abstraction. ) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 30e7ebf7827..ef1936a871a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -58,8 +58,8 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); */ int hw_perf_counter_init(struct perf_counter *counter) { + struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; - u32 hw_event_type = counter->event.hw_event_type; if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -77,14 +77,14 @@ int hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 0; if (capable(CAP_SYS_ADMIN)) { hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (hw_event_type & PERF_COUNT_NMI) + if (hw_event->nmi) hwc->nmi = 1; } - hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; - hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; + hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; + hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = counter->event.hw_event_period; + hwc->irq_period = hw_event->irq_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -93,21 +93,20 @@ int hw_perf_counter_init(struct perf_counter *counter) if (!hwc->irq_period) hwc->irq_period = 0x7FFFFFFF; - hwc->next_count = -((s32) hwc->irq_period); + hwc->next_count = -(s32)hwc->irq_period; /* * Raw event type provide the config in the event structure */ - hw_event_type &= ~PERF_COUNT_NMI; - if (hw_event_type == PERF_COUNT_RAW) { - hwc->config |= counter->event.hw_raw_ctrl; + if (hw_event->raw) { + hwc->config |= hw_event->type; } else { - if (hw_event_type >= max_intel_perfmon_events) + if (hw_event->type >= max_intel_perfmon_events) return -EINVAL; /* * The generic map: */ - hwc->config |= intel_perfmon_event_map[hw_event_type]; + hwc->config |= intel_perfmon_event_map[hw_event->type]; } counter->wakeup_pending = 0; @@ -354,7 +353,7 @@ 
perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) int bit; list_for_each_entry(counter, &ctx->counters, list) { - if (counter->record_type != PERF_RECORD_SIMPLE || + if (counter->hw_event.record_type != PERF_RECORD_SIMPLE || counter == leader) continue; @@ -372,7 +371,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->event.hw_event_type); + perf_store_irq_data(leader, counter->hw_event.type); perf_store_irq_data(leader, atomic64_counter_read(counter)); } } @@ -410,7 +409,7 @@ again: perf_save_and_restart(counter); - switch (counter->record_type) { + switch (counter->hw_event.record_type) { case PERF_RECORD_SIMPLE: continue; case PERF_RECORD_IRQ: @@ -418,7 +417,7 @@ again: break; case PERF_RECORD_GROUP: perf_store_irq_data(counter, - counter->event.hw_event_type); + counter->hw_event.type); perf_store_irq_data(counter, atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); -- cgit v1.2.3 From 04289bb9891882202d7e961c4c04d2376930e9f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 08:38:42 +0100 Subject: perf counters: add support for group counters Impact: add group counters This patch adds the "counter groups" abstraction. Groups of counters behave much like normal 'single' counters, with a few semantic and behavioral extensions on top of that. A counter group is created by creating a new counter with the open() syscall's group-leader group_fd file descriptor parameter pointing to another, already existing counter. Groups of counters are scheduled in and out in one atomic group, and they are also roundrobin-scheduled atomically. Counters that are member of a group can also record events with an (atomic) extended timestamp that extends to all members of the group, if the record type is set to PERF_RECORD_GROUP. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ef1936a871a..54b4ad0cce6 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -346,18 +346,22 @@ static void perf_save_and_restart(struct perf_counter *counter) } static void -perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) +perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) { - struct perf_counter_context *ctx = leader->ctx; - struct perf_counter *counter; + struct perf_counter *counter, *group_leader = sibling->group_leader; int bit; - list_for_each_entry(counter, &ctx->counters, list) { - if (counter->hw_event.record_type != PERF_RECORD_SIMPLE || - counter == leader) - continue; + /* + * Store the counter's own timestamp first: + */ + perf_store_irq_data(sibling, sibling->hw_event.type); + perf_store_irq_data(sibling, atomic64_counter_read(sibling)); - if (counter->active) { + /* + * Then store sibling timestamps (if any): + */ + list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { + if (!counter->active) { /* * When counter was not in the overflow mask, we have to * read it from hardware. 
We read it as well, when it @@ -371,8 +375,8 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->hw_event.type); - perf_store_irq_data(leader, atomic64_counter_read(counter)); + perf_store_irq_data(sibling, counter->hw_event.type); + perf_store_irq_data(sibling, atomic64_counter_read(counter)); } } @@ -416,10 +420,6 @@ again: perf_store_irq_data(counter, instruction_pointer(regs)); break; case PERF_RECORD_GROUP: - perf_store_irq_data(counter, - counter->hw_event.type); - perf_store_irq_data(counter, - atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); break; } -- cgit v1.2.3 From 621a01eac89b5e2f81a4cf576568b31f40a02724 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 12:46:46 +0100 Subject: perf counters: hw driver API Impact: restructure code, introduce hw_ops driver abstraction Introduce this abstraction to handle counter details: struct hw_perf_counter_ops { void (*hw_perf_counter_enable) (struct perf_counter *counter); void (*hw_perf_counter_disable) (struct perf_counter *counter); void (*hw_perf_counter_read) (struct perf_counter *counter); }; This will be useful to support assymetric hw details, and it will also be useful to implement "software counters". (Counters that count kernel managed sw events such as pagefaults, context-switches, wall-clock time or task-local time.) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 54b4ad0cce6..718b635dece 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -56,7 +56,7 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* * Setup the hardware configuration for a given hw_event_type */ -int hw_perf_counter_init(struct perf_counter *counter) +static int __hw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; @@ -135,7 +135,7 @@ u64 hw_perf_disable_all(void) EXPORT_SYMBOL_GPL(hw_perf_disable_all); static inline void -__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) +__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) { wrmsr(hwc->config_base + idx, hwc->config, 0); } @@ -149,13 +149,13 @@ static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx) wrmsr(hwc->counter_base + idx, hwc->next_count, 0); } -static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } -void hw_perf_counter_enable(struct perf_counter *counter) +static void x86_perf_counter_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -170,12 +170,12 @@ void hw_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - __hw_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(hwc, idx); cpuc->counters[idx] = counter; __hw_perf_counter_set_period(hwc, idx); - __hw_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(hwc, idx); } #ifdef CONFIG_X86_64 @@ -282,20 +282,20 @@ void 
perf_counter_print_debug(void) local_irq_enable(); } -void hw_perf_counter_disable(struct perf_counter *counter) +static void x86_perf_counter_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __hw_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; __hw_perf_save_counter(counter, hwc, idx); } -void hw_perf_counter_read(struct perf_counter *counter) +static void x86_perf_counter_read(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; unsigned long addr = hwc->counter_base + hwc->idx; @@ -342,7 +342,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __hw_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(hwc, idx); } static void @@ -572,3 +572,20 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } + +static struct hw_perf_counter_ops x86_perf_counter_ops = { + .hw_perf_counter_enable = x86_perf_counter_enable, + .hw_perf_counter_disable = x86_perf_counter_disable, + .hw_perf_counter_read = x86_perf_counter_read, +}; + +struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter) +{ + int err; + + err = __hw_perf_counter_init(counter); + if (err) + return NULL; + + return &x86_perf_counter_ops; +} -- cgit v1.2.3 From 5c92d12411dfe5f0f3d1b1c1e2f756245e6f7249 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 13:21:10 +0100 Subject: perf counters: implement PERF_COUNT_CPU_CLOCK Impact: add new perf-counter type The 'CPU clock' counter counts the amount of CPU clock time that is elapsing, in nanoseconds. (regardless of how much of it the task is spending on a CPU executing) This counter type is a Linux kernel based abstraction, it is available even if the hardware does not support native hardware performance counters. 
Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 36 ++++-------------------------------- 1 file changed, 4 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 718b635dece..43c8e9a38b4 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -178,35 +178,6 @@ static void x86_perf_counter_enable(struct perf_counter *counter) __x86_perf_counter_enable(hwc, idx); } -#ifdef CONFIG_X86_64 -static inline void atomic64_counter_set(struct perf_counter *counter, u64 val) -{ - atomic64_set(&counter->count, val); -} - -static inline u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic64_read(&counter->count); -} -#else -/* - * Todo: add proper atomic64_t support to 32-bit x86: - */ -static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64) -{ - u32 *val32 = (void *)&val64; - - atomic_set(counter->count32 + 0, *(val32 + 0)); - atomic_set(counter->count32 + 1, *(val32 + 1)); -} - -static inline u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic_read(counter->count32 + 0) | - (u64) atomic_read(counter->count32 + 1) << 32; -} -#endif - static void __hw_perf_save_counter(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { @@ -309,7 +280,7 @@ static void x86_perf_counter_read(struct perf_counter *counter) } while (offs != hwc->prev_count); val32 = (s32) val; - val = (s64)hwc->irq_period + (s64)val32; + val = (s64)hwc->irq_period + (s64)val32; atomic64_counter_set(counter, hwc->prev_count + val); } @@ -573,13 +544,14 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } -static struct hw_perf_counter_ops x86_perf_counter_ops = { +static const struct hw_perf_counter_ops x86_perf_counter_ops = { .hw_perf_counter_enable = x86_perf_counter_enable, .hw_perf_counter_disable = x86_perf_counter_disable, .hw_perf_counter_read = x86_perf_counter_read, }; -struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter) +const struct hw_perf_counter_ops * +hw_perf_counter_init(struct perf_counter *counter) { int err; -- cgit v1.2.3 From 01b2838c4298c5e0d30b4993c195ac34dd9df61e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 13:45:51 +0100 Subject: perf counters: consolidate hw_perf save/restore APIs Impact: cleanup Rename them to better match up the usual IRQ disable/enable APIs: hw_perf_disable_all() => hw_perf_save_disable() hw_perf_restore_ctrl() => hw_perf_restore() Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 43c8e9a38b4..3e1dbebe22b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -118,13 +118,13 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_restore_ctrl(u64 ctrl) +void hw_perf_restore(u64 ctrl) { wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); } -EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl); +EXPORT_SYMBOL_GPL(hw_perf_restore); -u64 hw_perf_disable_all(void) +u64 hw_perf_save_disable(void) { u64 ctrl; @@ -132,7 +132,7 @@ u64 hw_perf_disable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); return ctrl; } -EXPORT_SYMBOL_GPL(hw_perf_disable_all); +EXPORT_SYMBOL_GPL(hw_perf_save_disable); static inline void __x86_perf_counter_disable(struct 
hw_perf_counter *hwc, unsigned int idx) -- cgit v1.2.3 From 6a930700c8b655a9e25e42fc4adc0b225ebbcefc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 15:17:03 +0100 Subject: perf counters: clean up state transitions Impact: cleanup Introduce a proper enum for the 3 states of a counter: PERF_COUNTER_STATE_OFF = -1 PERF_COUNTER_STATE_INACTIVE = 0 PERF_COUNTER_STATE_ACTIVE = 1 and rename counter->active to counter->state and propagate the changes everywhere. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3e1dbebe22b..4854cca7fff 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -332,7 +332,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) * Then store sibling timestamps (if any): */ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - if (!counter->active) { + if (counter->state != PERF_COUNTER_STATE_ACTIVE) { /* * When counter was not in the overflow mask, we have to * read it from hardware. We read it as well, when it -- cgit v1.2.3 From 9b194e831fb2c322ed81a373e49620f34edc2778 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 20:22:35 +0100 Subject: x86: implement atomic64_t on 32-bit Impact: new API Implement the atomic64_t APIs on 32-bit as well. Will be used by the performance counters code. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic_32.h | 218 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index ad5b9f6ecdd..9927e01b03c 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -255,5 +255,223 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() +/* An 64bit atomic type */ + +typedef struct { + unsigned long long counter; +} atomic64_t; + +#define ATOMIC64_INIT(val) { (val) } + +/** + * atomic64_read - read atomic64 variable + * @v: pointer of type atomic64_t + * + * Atomically reads the value of @v. + * Doesn't imply a read memory barrier. + */ +#define __atomic64_read(ptr) ((ptr)->counter) + +static inline unsigned long long +cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new) +{ + asm volatile( + + LOCK_PREFIX "cmpxchg8b (%[ptr])\n" + + : "=A" (old) + + : [ptr] "D" (ptr), + "A" (old), + "b" (ll_low(new)), + "c" (ll_high(new)) + + : "memory"); + + return old; +} + +static inline unsigned long long +atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val, + unsigned long long new_val) +{ + return cmpxchg8b(&ptr->counter, old_val, new_val); +} + +/** + * atomic64_set - set atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * + * Atomically sets the value of @ptr to @new_val. + */ +static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) +{ + unsigned long long old_val; + + do { + old_val = atomic_read(ptr); + } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); +} + +/** + * atomic64_read - read atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically reads the value of @ptr and returns it. 
+ */ +static inline unsigned long long atomic64_read(atomic64_t *ptr) +{ + unsigned long long curr_val; + + do { + curr_val = __atomic64_read(ptr); + } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val); + + return curr_val; +} + +/** + * atomic64_add_return - add and return + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns @delta + *@ptr + */ +static inline unsigned long long +atomic64_add_return(unsigned long long delta, atomic64_t *ptr) +{ + unsigned long long old_val, new_val; + + do { + old_val = atomic_read(ptr); + new_val = old_val + delta; + + } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); + + return new_val; +} + +static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr) +{ + return atomic64_add_return(-delta, ptr); +} + +static inline long atomic64_inc_return(atomic64_t *ptr) +{ + return atomic64_add_return(1, ptr); +} + +static inline long atomic64_dec_return(atomic64_t *ptr) +{ + return atomic64_sub_return(1, ptr); +} + +/** + * atomic64_add - add integer to atomic64 variable + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr. + */ +static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr) +{ + atomic64_add_return(delta, ptr); +} + +/** + * atomic64_sub - subtract the atomic64 variable + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr. + */ +static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr) +{ + atomic64_add(-delta, ptr); +} + +/** + * atomic64_sub_and_test - subtract value from variable and test result + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr and returns + * true if the result is zero, or false for all + * other cases. + */ +static inline int +atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr) +{ + unsigned long long old_val = atomic64_sub_return(delta, ptr); + + return old_val == 0; +} + +/** + * atomic64_inc - increment atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1. + */ +static inline void atomic64_inc(atomic64_t *ptr) +{ + atomic64_add(1, ptr); +} + +/** + * atomic64_dec - decrement atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1. + */ +static inline void atomic64_dec(atomic64_t *ptr) +{ + atomic64_sub(1, ptr); +} + +/** + * atomic64_dec_and_test - decrement and test + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +static inline int atomic64_dec_and_test(atomic64_t *ptr) +{ + return atomic64_sub_and_test(1, ptr); +} + +/** + * atomic64_inc_and_test - increment and test + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +static inline int atomic64_inc_and_test(atomic64_t *ptr) +{ + return atomic64_sub_and_test(-1, ptr); +} + +/** + * atomic64_add_negative - add and test if negative + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. 
+ */ +static inline int +atomic64_add_negative(unsigned long long delta, atomic64_t *ptr) +{ + long long old_val = atomic64_add_return(delta, ptr); + + return old_val < 0; +} + #include #endif /* _ASM_X86_ATOMIC_32_H */ -- cgit v1.2.3 From ee06094f8279e1312fc0a31591320cc7b6f0ab1e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 13 Dec 2008 09:00:03 +0100 Subject: perfcounters: restructure x86 counter math Impact: restructure code Change counter math from absolute values to clear delta logic. We try to extract elapsed deltas from the raw hw counter - and put that into the generic counter. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/kernel/cpu/perf_counter.c | 230 ++++++++++++++++++++----------------- 2 files changed, 125 insertions(+), 107 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f2fdc186724..fe94490bab6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -643,7 +643,7 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH)) - select HAVE_PERF_COUNTERS + select HAVE_PERF_COUNTERS if (!M386 && !M486) config X86_IO_APIC def_bool y diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b903f8df72b..5afae13d8d5 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -53,6 +53,48 @@ const int intel_perfmon_event_map[] = const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); +/* + * Propagate counter elapsed time into the generic counter. + * Can only be executed on the CPU where the counter is active. + * Returns the delta events processed. + */ +static void +x86_perf_counter_update(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) +{ + u64 prev_raw_count, new_raw_count, delta; + + WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE); + /* + * Careful: an NMI might modify the previous counter value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic counter atomically: + */ +again: + prev_raw_count = atomic64_read(&hwc->prev_count); + rdmsrl(hwc->counter_base + idx, new_raw_count); + + if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (counter-)time and add that to the generic counter. 
+ * + * Careful, not all hw sign-extends above the physical width + * of the count, so we do that by clipping the delta to 32 bits: + */ + delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); + WARN_ON_ONCE((int)delta < 0); + + atomic64_add(delta, &counter->count); + atomic64_sub(delta, &hwc->period_left); +} + /* * Setup the hardware configuration for a given hw_event_type */ @@ -90,10 +132,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * so we install an artificial 1<<31 period regardless of * the generic counter period: */ - if (!hwc->irq_period) + if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) hwc->irq_period = 0x7FFFFFFF; - hwc->next_count = -(s32)hwc->irq_period; + atomic64_set(&hwc->period_left, hwc->irq_period); /* * Raw event type provide the config in the event structure @@ -118,12 +160,6 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_restore(u64 ctrl) -{ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); -} -EXPORT_SYMBOL_GPL(hw_perf_restore); - u64 hw_perf_save_disable(void) { u64 ctrl; @@ -134,27 +170,74 @@ u64 hw_perf_save_disable(void) } EXPORT_SYMBOL_GPL(hw_perf_save_disable); +void hw_perf_restore(u64 ctrl) +{ + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); +} +EXPORT_SYMBOL_GPL(hw_perf_restore); + static inline void -__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) +__x86_perf_counter_disable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int idx) { - wrmsr(hwc->config_base + idx, hwc->config, 0); + int err; + + err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); + WARN_ON_ONCE(err); } -static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]); +static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); -static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx) +/* + * Set the next IRQ period, based on the hwc->period_left value. 
+ * To be called with the counter disabled in hw: + */ +static void +__hw_perf_counter_set_period(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { - per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count; + s32 left = atomic64_read(&hwc->period_left); + s32 period = hwc->irq_period; + + WARN_ON_ONCE(period <= 0); + + /* + * If we are way outside a reasoable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_set(&hwc->period_left, left); + } - wrmsr(hwc->counter_base + idx, hwc->next_count, 0); + WARN_ON_ONCE(left <= 0); + + per_cpu(prev_left[idx], smp_processor_id()) = left; + + /* + * The hw counter starts counting from this counter offset, + * mark it to be able to extra future deltas: + */ + atomic64_set(&hwc->prev_count, (u64)(s64)-left); + + wrmsr(hwc->counter_base + idx, -left, 0); } -static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +static void +__x86_perf_counter_enable(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } +/* + * Find a PMC slot for the freshly enabled / scheduled in counter: + */ static void x86_perf_counter_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -170,55 +253,17 @@ static void x86_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - __x86_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(counter, hwc, idx); cpuc->counters[idx] = counter; - __hw_perf_counter_set_period(hwc, idx); - __x86_perf_counter_enable(hwc, idx); -} - -static void __hw_perf_save_counter(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) -{ - s64 raw = -1; - s64 delta; - - /* - * Get the raw hw counter value: - */ - rdmsrl(hwc->counter_base + idx, raw); - - /* - * Rebase it to zero (it started counting at -irq_period), - * to see the delta since ->prev_count: - */ - delta = (s64)hwc->irq_period + (s64)(s32)raw; - - atomic64_counter_set(counter, hwc->prev_count + delta); - - /* - * Adjust the ->prev_count offset - if we went beyond - * irq_period of units, then we got an IRQ and the counter - * was set back to -irq_period: - */ - while (delta >= (s64)hwc->irq_period) { - hwc->prev_count += hwc->irq_period; - delta -= (s64)hwc->irq_period; - } - - /* - * Calculate the next raw counter value we'll write into - * the counter at the next sched-in time: - */ - delta -= (s64)hwc->irq_period; - - hwc->next_count = (s32)delta; + __hw_perf_counter_set_period(counter, hwc, idx); + __x86_perf_counter_enable(counter, hwc, idx); } void perf_counter_print_debug(void) { - u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count; + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; int cpu, idx; if (!nr_hw_counters) @@ -241,14 +286,14 @@ void perf_counter_print_debug(void) rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); - next_count = per_cpu(prev_next_count[idx], cpu); + prev_left = per_cpu(prev_left[idx], cpu); printk(KERN_INFO "CPU#%d: PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n", cpu, idx, pmc_count); - printk(KERN_INFO "CPU#%d: PMC%d next: %016llx\n", - cpu, idx, next_count); + printk(KERN_INFO "CPU#%d: PMC%d left: %016llx\n", + cpu, idx, prev_left); } 
local_irq_enable(); } @@ -259,29 +304,16 @@ static void x86_perf_counter_disable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __x86_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; - __hw_perf_save_counter(counter, hwc, idx); -} -static void x86_perf_counter_read(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - unsigned long addr = hwc->counter_base + hwc->idx; - s64 offs, val = -1LL; - s32 val32; - - /* Careful: NMI might modify the counter offset */ - do { - offs = hwc->prev_count; - rdmsrl(addr, val); - } while (offs != hwc->prev_count); - - val32 = (s32) val; - val = (s64)hwc->irq_period + (s64)val32; - atomic64_counter_set(counter, hwc->prev_count + val); + /* + * Drain the remaining delta count out of a counter + * that we are disabling: + */ + x86_perf_counter_update(counter, hwc, idx); } static void perf_store_irq_data(struct perf_counter *counter, u64 data) @@ -299,7 +331,8 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data) } /* - * NMI-safe enable method: + * Save and restart an expired counter. Called by NMI contexts, + * so it has to be careful about preempting normal counter ops: */ static void perf_save_and_restart(struct perf_counter *counter) { @@ -309,45 +342,25 @@ static void perf_save_and_restart(struct perf_counter *counter) rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); - __hw_perf_save_counter(counter, hwc, idx); - __hw_perf_counter_set_period(hwc, idx); + x86_perf_counter_update(counter, hwc, idx); + __hw_perf_counter_set_period(counter, hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __x86_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(counter, hwc, idx); } static void perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) { struct perf_counter *counter, *group_leader = sibling->group_leader; - int bit; - - /* - * Store the counter's own timestamp first: - */ - perf_store_irq_data(sibling, sibling->hw_event.type); - perf_store_irq_data(sibling, atomic64_counter_read(sibling)); /* - * Then store sibling timestamps (if any): + * Store sibling timestamps (if any): */ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) { - /* - * When counter was not in the overflow mask, we have to - * read it from hardware. We read it as well, when it - * has not been read yet and clear the bit in the - * status mask. 
- */ - bit = counter->hw.idx; - if (!test_bit(bit, (unsigned long *) overflown) || - test_bit(bit, (unsigned long *) status)) { - clear_bit(bit, (unsigned long *) status); - perf_save_and_restart(counter); - } - } + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); perf_store_irq_data(sibling, counter->hw_event.type); - perf_store_irq_data(sibling, atomic64_counter_read(counter)); + perf_store_irq_data(sibling, atomic64_read(&counter->count)); } } @@ -540,6 +553,11 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } +static void x86_perf_counter_read(struct perf_counter *counter) +{ + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); +} + static const struct hw_perf_counter_ops x86_perf_counter_ops = { .hw_perf_counter_enable = x86_perf_counter_enable, .hw_perf_counter_disable = x86_perf_counter_disable, -- cgit v1.2.3 From 2b9ff0db19b5e2c77000b7201525f9c3d6e8328d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 18:36:30 +0100 Subject: perfcounters: fix non-intel-perfmon CPUs Do not write MSR_CORE_PERF_GLOBAL_CTRL on CPUs where it does not exist. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5afae13d8d5..6d30f603b62 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -157,6 +157,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) void hw_perf_enable_all(void) { + if (unlikely(!perf_counters_initialized)) + return; + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } @@ -164,14 +167,21 @@ u64 hw_perf_save_disable(void) { u64 ctrl; + if (unlikely(!perf_counters_initialized)) + return 0; + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + return ctrl; } EXPORT_SYMBOL_GPL(hw_perf_save_disable); void hw_perf_restore(u64 ctrl) { + if (unlikely(!perf_counters_initialized)) + return; + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); } EXPORT_SYMBOL_GPL(hw_perf_restore); -- cgit v1.2.3 From 75f224cf7700ed6006574dc3f2efa29860727570 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 21:58:46 +0100 Subject: perfcounters: fix lapic initialization Fix non-working NMI sampling in certain bootup scenarios. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6d30f603b62..8a154bd7ba9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -557,10 +557,10 @@ void __init init_hw_perf_counters(void) printk(KERN_INFO "... bit_width: %d\n", eax.split.bit_width); printk(KERN_INFO "... 
mask_length: %d\n", eax.split.mask_length); + perf_counters_initialized = true; + perf_counters_lapic_init(0); register_die_notifier(&perf_counter_nmi_notifier); - - perf_counters_initialized = true; } static void x86_perf_counter_read(struct perf_counter *counter) -- cgit v1.2.3 From 94c46572a6d9bb497eda0a14099d9f1360d57d5d Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Fri, 19 Dec 2008 22:37:58 +0530 Subject: x86: perf_counter.c intel_perfmon_event_map and max_intel_perfmon_events should be static Impact: cleanup, avoid sparse warnings, reduce kernel size a bit Fixes these sparse warnings: arch/x86/kernel/cpu/perf_counter.c:44:11: warning: symbol 'intel_perfmon_event_map' was not declared. Should it be static? arch/x86/kernel/cpu/perf_counter.c:54:11: warning: symbol 'max_intel_perfmon_events' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8a154bd7ba9..bdbdb56eaa3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -41,7 +41,7 @@ struct cpu_hw_counters { */ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); -const int intel_perfmon_event_map[] = +static const int intel_perfmon_event_map[] = { [PERF_COUNT_CYCLES] = 0x003c, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, @@ -51,7 +51,7 @@ const int intel_perfmon_event_map[] = [PERF_COUNT_BRANCH_MISSES] = 0x00c5, }; -const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); +static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* * Propagate counter elapsed time into the generic counter. -- cgit v1.2.3 From 8fb9331391af95ca1f4e5c0a0da8120b13cbae01 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:04:16 +0100 Subject: perfcounters: remove warnings Impact: remove debug checks Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index bdbdb56eaa3..89fad5d4fb3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -64,7 +64,6 @@ x86_perf_counter_update(struct perf_counter *counter, { u64 prev_raw_count, new_raw_count, delta; - WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE); /* * Careful: an NMI might modify the previous counter value. 
* @@ -89,7 +88,6 @@ again: * of the count, so we do that by clipping the delta to 32 bits: */ delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); - WARN_ON_ONCE((int)delta < 0); atomic64_add(delta, &counter->count); atomic64_sub(delta, &hwc->period_left); @@ -193,7 +191,6 @@ __x86_perf_counter_disable(struct perf_counter *counter, int err; err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); - WARN_ON_ONCE(err); } static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); @@ -209,8 +206,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter, s32 left = atomic64_read(&hwc->period_left); s32 period = hwc->irq_period; - WARN_ON_ONCE(period <= 0); - /* * If we are way outside a reasoable range then just skip forward: */ @@ -224,8 +219,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter, atomic64_set(&hwc->period_left, left); } - WARN_ON_ONCE(left <= 0); - per_cpu(prev_left[idx], smp_processor_id()) = left; /* -- cgit v1.2.3 From 5c167b8585c8d91206b395d57011ead7711e322f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 09:02:19 +0100 Subject: x86, perfcounters: rename intel_arch_perfmon.h => perf_counter.h Impact: rename include file We'll be providing an asm/perf_counter.h to the generic perfcounter code, so use the already existing x86 file for this purpose and rename it. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/intel_arch_perfmon.h | 41 ------------------------------- arch/x86/include/asm/perf_counter.h | 41 +++++++++++++++++++++++++++++++ arch/x86/kernel/apic.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 2 +- arch/x86/kernel/cpu/perfctr-watchdog.c | 2 +- arch/x86/oprofile/op_model_ppro.c | 2 +- 7 files changed, 46 insertions(+), 46 deletions(-) delete mode 100644 arch/x86/include/asm/intel_arch_perfmon.h create mode 100644 arch/x86/include/asm/perf_counter.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h deleted file mode 100644 index 71598a9eab6..00000000000 --- a/arch/x86/include/asm/intel_arch_perfmon.h +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H -#define _ASM_X86_INTEL_ARCH_PERFMON_H - -#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 -#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 - -#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 -#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 - -#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) -#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) -#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) -#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) - -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ - (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) - -#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 - -union cpuid10_eax { - struct { - unsigned int version_id:8; - unsigned int num_counters:8; - unsigned int bit_width:8; - unsigned int mask_length:8; - } split; - unsigned int full; -}; - -#ifdef CONFIG_PERF_COUNTERS -extern void init_hw_perf_counters(void); -extern void perf_counters_lapic_init(int nmi); -#else -static inline void init_hw_perf_counters(void) { } -static inline void perf_counters_lapic_init(int nmi) { } -#endif - -#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */ diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h new file mode 100644 index 00000000000..9dadce1124e --- /dev/null +++ 
b/arch/x86/include/asm/perf_counter.h @@ -0,0 +1,41 @@ +#ifndef _ASM_X86_PERF_COUNTER_H +#define _ASM_X86_PERF_COUNTER_H + +#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 +#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 + +#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 +#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 + +#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) +#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) +#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) + +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ + (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) + +#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 + +union cpuid10_eax { + struct { + unsigned int version_id:8; + unsigned int num_counters:8; + unsigned int bit_width:8; + unsigned int mask_length:8; + } split; + unsigned int full; +}; + +#ifdef CONFIG_PERF_COUNTERS +extern void init_hw_perf_counters(void); +extern void perf_counters_lapic_init(int nmi); +#else +static inline void init_hw_perf_counters(void) { } +static inline void perf_counters_lapic_init(int nmi) { } +#endif + +#endif /* _ASM_X86_PERF_COUNTER_H */ diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 0579ec1cd6e..4f859acb156 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -31,7 +31,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4461011db47..ad331b4d623 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 89fad5d4fb3..a4a3a09a654 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include static bool perf_counters_initialized __read_mostly; diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 9abd48b2267..d6f5b9fbde3 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -20,7 +20,7 @@ #include #include -#include +#include struct nmi_watchdog_ctlblk { unsigned int cccr_msr; diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index e9f80c744cf..07c914555a5 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include "op_x86_model.h" #include "op_counter.h" -- cgit v1.2.3 From eb2b861810d4ff72454c83996b891df4e0aaff9a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 09:09:13 +0100 Subject: x86, perfcounters: prepare for fixed-mode PMCs Impact: refactor the x86 code for fixed-mode PMCs Extend the data structures and rename the existing facilities to allow for a 'generic' versus 'fixed' counter distinction. 
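The core of the preparation is easiest to see when the affected data structure is read as a whole (consolidated here from the hunks below; the comments are added for orientation): the per-CPU bookkeeping now carries separate slots and allocation bitmaps for the two PMC classes, even though the fixed slots are not used yet at this point in the series:

struct cpu_hw_counters {
	/* architectural ("generic") PMCs, programmed via EVENTSEL/PERFCTR MSRs: */
	struct perf_counter	*generic[X86_PMC_MAX_GENERIC];
	unsigned long		used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)];

	/* fixed-function PMCs, to be wired up by later patches: */
	struct perf_counter	*fixed[X86_PMC_MAX_FIXED];
	unsigned long		used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)];
};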
Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 11 ++++++++ arch/x86/kernel/cpu/perf_counter.c | 53 ++++++++++++++++++------------------- 2 files changed, 37 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 9dadce1124e..dd5a4a559e2 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -1,6 +1,13 @@ #ifndef _ASM_X86_PERF_COUNTER_H #define _ASM_X86_PERF_COUNTER_H +/* + * Performance counter hw details: + */ + +#define X86_PMC_MAX_GENERIC 8 +#define X86_PMC_MAX_FIXED 3 + #define MSR_ARCH_PERFMON_PERFCTR0 0xc1 #define MSR_ARCH_PERFMON_PERFCTR1 0xc2 @@ -20,6 +27,10 @@ #define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 +/* + * Intel "Architectural Performance Monitoring" CPUID + * detection/enumeration details: + */ union cpuid10_eax { struct { unsigned int version_id:8; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a4a3a09a654..fc3af868823 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -27,13 +27,12 @@ static bool perf_counters_initialized __read_mostly; static int nr_hw_counters __read_mostly; static u32 perf_counter_mask __read_mostly; -/* No support for fixed function counters yet */ - -#define MAX_HW_COUNTERS 8 - struct cpu_hw_counters { - struct perf_counter *counters[MAX_HW_COUNTERS]; - unsigned long used[BITS_TO_LONGS(MAX_HW_COUNTERS)]; + struct perf_counter *generic[X86_PMC_MAX_GENERIC]; + unsigned long used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)]; + + struct perf_counter *fixed[X86_PMC_MAX_FIXED]; + unsigned long used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)]; }; /* @@ -185,7 +184,7 @@ void hw_perf_restore(u64 ctrl) EXPORT_SYMBOL_GPL(hw_perf_restore); static inline void -__x86_perf_counter_disable(struct perf_counter *counter, +__pmc_generic_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int idx) { int err; @@ -193,7 +192,7 @@ __x86_perf_counter_disable(struct perf_counter *counter, err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); } -static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); +static DEFINE_PER_CPU(u64, prev_left[X86_PMC_MAX_GENERIC]); /* * Set the next IRQ period, based on the hwc->period_left value. 
@@ -231,7 +230,7 @@ __hw_perf_counter_set_period(struct perf_counter *counter, } static void -__x86_perf_counter_enable(struct perf_counter *counter, +__pmc_generic_enable(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, @@ -241,7 +240,7 @@ __x86_perf_counter_enable(struct perf_counter *counter, /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static void x86_perf_counter_enable(struct perf_counter *counter) +static void pmc_generic_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -256,12 +255,12 @@ static void x86_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - __x86_perf_counter_disable(counter, hwc, idx); + __pmc_generic_disable(counter, hwc, idx); - cpuc->counters[idx] = counter; + cpuc->generic[idx] = counter; __hw_perf_counter_set_period(counter, hwc, idx); - __x86_perf_counter_enable(counter, hwc, idx); + __pmc_generic_enable(counter, hwc, idx); } void perf_counter_print_debug(void) @@ -301,16 +300,16 @@ void perf_counter_print_debug(void) local_irq_enable(); } -static void x86_perf_counter_disable(struct perf_counter *counter) +static void pmc_generic_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __x86_perf_counter_disable(counter, hwc, idx); + __pmc_generic_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); - cpuc->counters[idx] = NULL; + cpuc->generic[idx] = NULL; /* * Drain the remaining delta count out of a counter @@ -349,7 +348,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __x86_perf_counter_enable(counter, hwc, idx); + __pmc_generic_enable(counter, hwc, idx); } static void @@ -392,7 +391,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) again: ack = status; for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { - struct perf_counter *counter = cpuc->counters[bit]; + struct perf_counter *counter = cpuc->generic[bit]; clear_bit(bit, (unsigned long *) &status); if (!counter) @@ -412,7 +411,7 @@ again: } /* * From NMI context we cannot call into the scheduler to - * do a task wakeup - but we mark these counters as + * do a task wakeup - but we mark these generic as * wakeup_pending and initate a wakeup callback: */ if (nmi) { @@ -462,7 +461,7 @@ void perf_counter_notify(struct pt_regs *regs) cpuc = &per_cpu(cpu_hw_counters, cpu); for_each_bit(bit, cpuc->used, nr_hw_counters) { - struct perf_counter *counter = cpuc->counters[bit]; + struct perf_counter *counter = cpuc->generic[bit]; if (!counter) continue; @@ -539,10 +538,10 @@ void __init init_hw_perf_counters(void) printk(KERN_INFO "... version: %d\n", eax.split.version_id); printk(KERN_INFO "... 
num_counters: %d\n", eax.split.num_counters); nr_hw_counters = eax.split.num_counters; - if (nr_hw_counters > MAX_HW_COUNTERS) { - nr_hw_counters = MAX_HW_COUNTERS; + if (nr_hw_counters > X86_PMC_MAX_GENERIC) { + nr_hw_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - nr_hw_counters, MAX_HW_COUNTERS); + nr_hw_counters, X86_PMC_MAX_GENERIC); } perf_counter_mask = (1 << nr_hw_counters) - 1; perf_max_counters = nr_hw_counters; @@ -556,15 +555,15 @@ void __init init_hw_perf_counters(void) register_die_notifier(&perf_counter_nmi_notifier); } -static void x86_perf_counter_read(struct perf_counter *counter) +static void pmc_generic_read(struct perf_counter *counter) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); } static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .hw_perf_counter_enable = x86_perf_counter_enable, - .hw_perf_counter_disable = x86_perf_counter_disable, - .hw_perf_counter_read = x86_perf_counter_read, + .hw_perf_counter_enable = pmc_generic_enable, + .hw_perf_counter_disable = pmc_generic_disable, + .hw_perf_counter_read = pmc_generic_read, }; const struct hw_perf_counter_ops * -- cgit v1.2.3 From 703e937c83bbad79075a7846e062e447c2fee6a4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 10:51:15 +0100 Subject: perfcounters: add fixed-mode PMC enumeration Enumerate fixed-mode PMCs based on CPUID, and feed that into the perfcounter code. Does not use fixed-mode PMCs yet. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 23 +++++++++++++++++++++++ arch/x86/kernel/cpu/perf_counter.c | 23 +++++++++++++++++------ 2 files changed, 40 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index dd5a4a559e2..945a315e6d6 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -41,6 +41,29 @@ union cpuid10_eax { unsigned int full; }; +union cpuid10_edx { + struct { + unsigned int num_counters_fixed:4; + unsigned int reserved:28; + } split; + unsigned int full; +}; + + +/* + * Fixed-purpose performance counters: + */ + +/* Instr_Retired.Any: */ +#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 + +/* CPU_CLK_Unhalted.Core: */ +#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a + +/* CPU_CLK_Unhalted.Ref: */ +#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b + + #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); extern void perf_counters_lapic_init(int nmi); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fc3af868823..2fca50c4597 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -27,6 +27,8 @@ static bool perf_counters_initialized __read_mostly; static int nr_hw_counters __read_mostly; static u32 perf_counter_mask __read_mostly; +static int nr_hw_counters_fixed __read_mostly; + struct cpu_hw_counters { struct perf_counter *generic[X86_PMC_MAX_GENERIC]; unsigned long used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)]; @@ -519,8 +521,9 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { void __init init_hw_perf_counters(void) { union cpuid10_eax eax; - unsigned int unused; unsigned int ebx; + unsigned int unused; + union cpuid10_edx edx; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; @@ -529,14 +532,14 @@ void __init init_hw_perf_counters(void) * Check whether the Architectural PerfMon supports * Branch Misses Retired Event or not. 
*/ - cpuid(10, &(eax.full), &ebx, &unused, &unused); + cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return; printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); - printk(KERN_INFO "... version: %d\n", eax.split.version_id); - printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters); + printk(KERN_INFO "... version: %d\n", eax.split.version_id); + printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters); nr_hw_counters = eax.split.num_counters; if (nr_hw_counters > X86_PMC_MAX_GENERIC) { nr_hw_counters = X86_PMC_MAX_GENERIC; @@ -546,8 +549,16 @@ void __init init_hw_perf_counters(void) perf_counter_mask = (1 << nr_hw_counters) - 1; perf_max_counters = nr_hw_counters; - printk(KERN_INFO "... bit_width: %d\n", eax.split.bit_width); - printk(KERN_INFO "... mask_length: %d\n", eax.split.mask_length); + printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); + printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); + + nr_hw_counters_fixed = edx.split.num_counters_fixed; + if (nr_hw_counters_fixed > X86_PMC_MAX_FIXED) { + nr_hw_counters_fixed = X86_PMC_MAX_FIXED; + WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", + nr_hw_counters_fixed, X86_PMC_MAX_FIXED); + } + printk(KERN_INFO "... fixed counters: %d\n", nr_hw_counters_fixed); perf_counters_initialized = true; -- cgit v1.2.3 From 862a1a5f346fe7e9181ea51eaae48cf2cd70f746 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 13:09:20 +0100 Subject: x86, perfcounters: refactor code for fixed-function PMCs Impact: clean up Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 14 ++++++- arch/x86/kernel/cpu/perf_counter.c | 73 ++++++++++++++++++++----------------- 2 files changed, 52 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 945a315e6d6..13745deb16c 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -8,6 +8,10 @@ #define X86_PMC_MAX_GENERIC 8 #define X86_PMC_MAX_FIXED 3 +#define X86_PMC_IDX_GENERIC 0 +#define X86_PMC_IDX_FIXED 32 +#define X86_PMC_IDX_MAX 64 + #define MSR_ARCH_PERFMON_PERFCTR0 0xc1 #define MSR_ARCH_PERFMON_PERFCTR1 0xc2 @@ -54,6 +58,15 @@ union cpuid10_edx { * Fixed-purpose performance counters: */ +/* + * All 3 fixed-mode PMCs are configured via this single MSR: + */ +#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d + +/* + * The counts are available in three separate MSRs: + */ + /* Instr_Retired.Any: */ #define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 @@ -63,7 +76,6 @@ union cpuid10_edx { /* CPU_CLK_Unhalted.Ref: */ #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b - #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); extern void perf_counters_lapic_init(int nmi); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2fca50c4597..358af526640 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -24,17 +24,14 @@ static bool perf_counters_initialized __read_mostly; /* * Number of (generic) HW counters: */ -static int nr_hw_counters __read_mostly; -static u32 perf_counter_mask __read_mostly; +static int nr_counters_generic __read_mostly; +static u64 perf_counter_mask __read_mostly; -static int nr_hw_counters_fixed __read_mostly; +static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { - struct perf_counter *generic[X86_PMC_MAX_GENERIC]; - 
unsigned long used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)]; - - struct perf_counter *fixed[X86_PMC_MAX_FIXED]; - unsigned long used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)]; + struct perf_counter *counters[X86_PMC_IDX_MAX]; + unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; }; /* @@ -159,7 +156,7 @@ void hw_perf_enable_all(void) if (unlikely(!perf_counters_initialized)) return; - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask); } u64 hw_perf_save_disable(void) @@ -170,7 +167,7 @@ u64 hw_perf_save_disable(void) return 0; rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); return ctrl; } @@ -181,7 +178,7 @@ void hw_perf_restore(u64 ctrl) if (unlikely(!perf_counters_initialized)) return; - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); } EXPORT_SYMBOL_GPL(hw_perf_restore); @@ -239,6 +236,11 @@ __pmc_generic_enable(struct perf_counter *counter, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } +static int fixed_mode_idx(struct hw_perf_counter *hwc) +{ + return -1; +} + /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ @@ -250,7 +252,7 @@ static void pmc_generic_enable(struct perf_counter *counter) /* Try to get the previous counter again */ if (test_and_set_bit(idx, cpuc->used)) { - idx = find_first_zero_bit(cpuc->used, nr_hw_counters); + idx = find_first_zero_bit(cpuc->used, nr_counters_generic); set_bit(idx, cpuc->used); hwc->idx = idx; } @@ -259,7 +261,7 @@ static void pmc_generic_enable(struct perf_counter *counter) __pmc_generic_disable(counter, hwc, idx); - cpuc->generic[idx] = counter; + cpuc->counters[idx] = counter; __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); @@ -270,7 +272,7 @@ void perf_counter_print_debug(void) u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; int cpu, idx; - if (!nr_hw_counters) + if (!nr_counters_generic) return; local_irq_disable(); @@ -286,7 +288,7 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); - for (idx = 0; idx < nr_hw_counters; idx++) { + for (idx = 0; idx < nr_counters_generic; idx++) { rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); @@ -311,7 +313,7 @@ static void pmc_generic_disable(struct perf_counter *counter) __pmc_generic_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); - cpuc->generic[idx] = NULL; + cpuc->counters[idx] = NULL; /* * Drain the remaining delta count out of a counter @@ -381,7 +383,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); /* Disable counters globally */ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ack_APIC_irq(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -392,8 +394,8 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) again: ack = status; - for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { - struct perf_counter *counter = cpuc->generic[bit]; + for_each_bit(bit, (unsigned long *) &status, nr_counters_generic) { + struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); if (!counter) @@ -424,7 +426,7 @@ again: } } - wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); /* * Repeat if 
there is more work to be done: @@ -436,7 +438,7 @@ out: /* * Restore - do not reenable when global enable is off: */ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); } void smp_perf_counter_interrupt(struct pt_regs *regs) @@ -462,8 +464,8 @@ void perf_counter_notify(struct pt_regs *regs) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - for_each_bit(bit, cpuc->used, nr_hw_counters) { - struct perf_counter *counter = cpuc->generic[bit]; + for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) { + struct perf_counter *counter = cpuc->counters[bit]; if (!counter) continue; @@ -540,26 +542,29 @@ void __init init_hw_perf_counters(void) printk(KERN_INFO "... version: %d\n", eax.split.version_id); printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters); - nr_hw_counters = eax.split.num_counters; - if (nr_hw_counters > X86_PMC_MAX_GENERIC) { - nr_hw_counters = X86_PMC_MAX_GENERIC; + nr_counters_generic = eax.split.num_counters; + if (nr_counters_generic > X86_PMC_MAX_GENERIC) { + nr_counters_generic = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - nr_hw_counters, X86_PMC_MAX_GENERIC); + nr_counters_generic, X86_PMC_MAX_GENERIC); } - perf_counter_mask = (1 << nr_hw_counters) - 1; - perf_max_counters = nr_hw_counters; + perf_counter_mask = (1 << nr_counters_generic) - 1; + perf_max_counters = nr_counters_generic; printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); - nr_hw_counters_fixed = edx.split.num_counters_fixed; - if (nr_hw_counters_fixed > X86_PMC_MAX_FIXED) { - nr_hw_counters_fixed = X86_PMC_MAX_FIXED; + nr_counters_fixed = edx.split.num_counters_fixed; + if (nr_counters_fixed > X86_PMC_MAX_FIXED) { + nr_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", - nr_hw_counters_fixed, X86_PMC_MAX_FIXED); + nr_counters_fixed, X86_PMC_MAX_FIXED); } - printk(KERN_INFO "... fixed counters: %d\n", nr_hw_counters_fixed); + printk(KERN_INFO "... fixed counters: %d\n", nr_counters_fixed); + + perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED; + printk(KERN_INFO "... counter mask: %016Lx\n", perf_counter_mask); perf_counters_initialized = true; perf_counters_lapic_init(0); -- cgit v1.2.3 From 7671581f1666ef4b54a1c1e598c51ac44c060a9b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 14:20:28 +0100 Subject: perfcounters: hw ops rename Impact: rename field names Shorten them. 
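The practical effect is shorter call sites in the generic code; that code is outside this arch/x86 view, so the following sketch is an assumption (in particular the hw_ops field name), shown only to illustrate the intent of the rename:

	/* before: */
	counter->hw_ops->hw_perf_counter_enable(counter);

	/* after: */
	counter->hw_ops->enable(counter);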
Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 358af526640..b6755712142 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -577,9 +577,9 @@ static void pmc_generic_read(struct perf_counter *counter) } static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .hw_perf_counter_enable = pmc_generic_enable, - .hw_perf_counter_disable = pmc_generic_disable, - .hw_perf_counter_read = pmc_generic_read, + .enable = pmc_generic_enable, + .disable = pmc_generic_disable, + .read = pmc_generic_read, }; const struct hw_perf_counter_ops * -- cgit v1.2.3 From 95cdd2e7851cce79ab839cb0b3cbe68d7911d0f1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 21 Dec 2008 13:50:42 +0100 Subject: perfcounters: enable lowlevel pmc code to schedule counters Allow lowlevel ->enable() op to return an error if a counter can not be added. This can be used to handle counter constraints. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b6755712142..74090a393a7 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -244,7 +244,7 @@ static int fixed_mode_idx(struct hw_perf_counter *hwc) /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static void pmc_generic_enable(struct perf_counter *counter) +static int pmc_generic_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -253,6 +253,8 @@ static void pmc_generic_enable(struct perf_counter *counter) /* Try to get the previous counter again */ if (test_and_set_bit(idx, cpuc->used)) { idx = find_first_zero_bit(cpuc->used, nr_counters_generic); + if (idx == nr_counters_generic) + return -EAGAIN; set_bit(idx, cpuc->used); hwc->idx = idx; } @@ -265,6 +267,8 @@ static void pmc_generic_enable(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); + + return 0; } void perf_counter_print_debug(void) -- cgit v1.2.3 From 0dff86aa7b9ec65a6d07167b7afb050b5fc98ddc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:28:12 +0100 Subject: x86, perfcounters: print out the ->used bitmask Impact: extend debug printouts Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 74090a393a7..f3359c2b391 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -255,6 +255,7 @@ static int pmc_generic_enable(struct perf_counter *counter) idx = find_first_zero_bit(cpuc->used, nr_counters_generic); if (idx == nr_counters_generic) return -EAGAIN; + set_bit(idx, cpuc->used); hwc->idx = idx; } @@ -274,6 +275,7 @@ static int pmc_generic_enable(struct perf_counter *counter) void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; + struct cpu_hw_counters *cpuc; int cpu, idx; if (!nr_counters_generic) @@ -282,6 +284,7 @@ void perf_counter_print_debug(void) local_irq_disable(); cpu = smp_processor_id(); + cpuc = 
&per_cpu(cpu_hw_counters, cpu); rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); @@ -291,6 +294,7 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); + printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); -- cgit v1.2.3 From f650a672359819454c3d8d4135ecd1558cde0b24 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:17:29 +0100 Subject: perfcounters: add PERF_COUNT_BUS_CYCLES Generalize "bus cycles" hw events - and map them to CPU_CLK_Unhalted.Ref on x86. (which is a good enough approximation) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f3359c2b391..86b2fdd344a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -41,12 +41,13 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); static const int intel_perfmon_event_map[] = { - [PERF_COUNT_CYCLES] = 0x003c, + [PERF_COUNT_CPU_CYCLES] = 0x003c, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e, [PERF_COUNT_CACHE_MISSES] = 0x412e, [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, [PERF_COUNT_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_BUS_CYCLES] = 0x013c, }; static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); -- cgit v1.2.3 From 2f18d1e8d07ae67dd0afce875287756d4bd31a46 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 22 Dec 2008 11:10:42 +0100 Subject: x86, perfcounters: add support for fixed-function pmcs Impact: extend performance counter support on x86 Intel CPUs Modern Intel CPUs have 3 "fixed-function" performance counters, which count these hardware events: Instr_Retired.Any CPU_CLK_Unhalted.Core CPU_CLK_Unhalted.Ref Add support for them to the performance counters subsystem. Their use is transparent to user-space: the counter scheduler is extended to automatically recognize the cases where a fixed-function PMC can be utilized instead of a generic PMC. In such cases the generic PMC is kept available for more counters. The above fixed-function events map to these generic counter hw events: PERF_COUNT_INSTRUCTIONS PERF_COUNT_CPU_CYCLES PERF_COUNT_BUS_CYCLES (The 'bus' cycles are in reality often CPU-ish cycles, just with a fixed frequency.) 
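A worked example of the fixed-counter control layout this patch programs (see __pmc_fixed_enable() in the diff below): every fixed PMC owns a 4-bit field in MSR_ARCH_PERFMON_FIXED_CTR_CTRL, so enabling fixed counter 1 (CPU_CLK_Unhalted.Core) for user plus kernel counting with PMI generation comes down to:

	bits  = 0x8 | 0x2 | 0x1;	/* PMI + ring-3 + ring-0 counting = 0xb */
	bits <<= 1 * 4;			/* fixed counter 1 owns bits 4-7 of the MSR */

	/* read-modify-write, clearing only this counter's 4-bit field: */
	ctrl_val = (ctrl_val & ~(0xfULL << (1 * 4))) | bits;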
Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 8 ++ arch/x86/kernel/cpu/perf_counter.c | 149 ++++++++++++++++++++++++++++++------ 2 files changed, 133 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 13745deb16c..2e08ed73664 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -23,6 +23,11 @@ #define ARCH_PERFMON_EVENTSEL_OS (1 << 17) #define ARCH_PERFMON_EVENTSEL_USR (1 << 16) +/* + * Includes eventsel and unit mask as well: + */ +#define ARCH_PERFMON_EVENT_MASK 0xffff + #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 @@ -69,12 +74,15 @@ union cpuid10_edx { /* Instr_Retired.Any: */ #define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 +#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) /* CPU_CLK_Unhalted.Core: */ #define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a +#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) /* CPU_CLK_Unhalted.Ref: */ #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b +#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 86b2fdd344a..da46eca1254 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -26,6 +26,7 @@ static bool perf_counters_initialized __read_mostly; */ static int nr_counters_generic __read_mostly; static u64 perf_counter_mask __read_mostly; +static u64 counter_value_mask __read_mostly; static int nr_counters_fixed __read_mostly; @@ -120,9 +121,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; } - hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; - hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = hw_event->irq_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, @@ -183,16 +181,34 @@ void hw_perf_restore(u64 ctrl) } EXPORT_SYMBOL_GPL(hw_perf_restore); +static inline void +__pmc_fixed_disable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, mask; + int err; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + static inline void __pmc_generic_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int idx) { int err; + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) + return __pmc_fixed_disable(counter, hwc, idx); + err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); } -static DEFINE_PER_CPU(u64, prev_left[X86_PMC_MAX_GENERIC]); +static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); /* * Set the next IRQ period, based on the hwc->period_left value. 
@@ -202,8 +218,9 @@ static void __hw_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { - s32 left = atomic64_read(&hwc->period_left); + s64 left = atomic64_read(&hwc->period_left); s32 period = hwc->irq_period; + int err; /* * If we are way outside a reasoable range then just skip forward: @@ -224,21 +241,64 @@ __hw_perf_counter_set_period(struct perf_counter *counter, * The hw counter starts counting from this counter offset, * mark it to be able to extra future deltas: */ - atomic64_set(&hwc->prev_count, (u64)(s64)-left); + atomic64_set(&hwc->prev_count, (u64)-left); - wrmsr(hwc->counter_base + idx, -left, 0); + err = checking_wrmsrl(hwc->counter_base + idx, + (u64)(-left) & counter_value_mask); +} + +static inline void +__pmc_fixed_enable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + int err; + + /* + * Enable IRQ generation (0x8) and ring-3 counting (0x2), + * and enable ring-0 counting if allowed: + */ + bits = 0x8ULL | 0x2ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + err = checking_wrmsrl(hwc->config_base, ctrl_val); } static void __pmc_generic_enable(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) + return __pmc_fixed_enable(counter, hwc, idx); + wrmsr(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } -static int fixed_mode_idx(struct hw_perf_counter *hwc) +static int +fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { + unsigned int event; + + if (unlikely(hwc->nmi)) + return -1; + + event = hwc->config & ARCH_PERFMON_EVENT_MASK; + + if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS])) + return X86_PMC_IDX_FIXED_INSTRUCTIONS; + if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES])) + return X86_PMC_IDX_FIXED_CPU_CYCLES; + if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES])) + return X86_PMC_IDX_FIXED_BUS_CYCLES; + return -1; } @@ -249,16 +309,39 @@ static int pmc_generic_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; - int idx = hwc->idx; + int idx; - /* Try to get the previous counter again */ - if (test_and_set_bit(idx, cpuc->used)) { - idx = find_first_zero_bit(cpuc->used, nr_counters_generic); - if (idx == nr_counters_generic) - return -EAGAIN; + idx = fixed_mode_idx(counter, hwc); + if (idx >= 0) { + /* + * Try to get the fixed counter, if that is already taken + * then try to get a generic counter: + */ + if (test_and_set_bit(idx, cpuc->used)) + goto try_generic; - set_bit(idx, cpuc->used); + hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + /* + * We set it so that counter_base + idx in wrmsr/rdmsr maps to + * MSR_ARCH_PERFMON_FIXED_CTR0 ... 
CTR2: + */ + hwc->counter_base = + MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; hwc->idx = idx; + } else { + idx = hwc->idx; + /* Try to get the previous generic counter again */ + if (test_and_set_bit(idx, cpuc->used)) { +try_generic: + idx = find_first_zero_bit(cpuc->used, nr_counters_generic); + if (idx == nr_counters_generic) + return -EAGAIN; + + set_bit(idx, cpuc->used); + hwc->idx = idx; + } + hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; + hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; } perf_counters_lapic_init(hwc->nmi); @@ -266,6 +349,10 @@ static int pmc_generic_enable(struct perf_counter *counter) __pmc_generic_disable(counter, hwc, idx); cpuc->counters[idx] = counter; + /* + * Make it visible before enabling the hw: + */ + smp_wmb(); __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); @@ -275,7 +362,7 @@ static int pmc_generic_enable(struct perf_counter *counter) void perf_counter_print_debug(void) { - u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; struct cpu_hw_counters *cpuc; int cpu, idx; @@ -290,11 +377,13 @@ void perf_counter_print_debug(void) rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); printk(KERN_INFO "\n"); printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); + printk(KERN_INFO "CPU#%d: fixed: %016llx\n", cpu, fixed); printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { @@ -303,13 +392,19 @@ void perf_counter_print_debug(void) prev_left = per_cpu(prev_left[idx], cpu); - printk(KERN_INFO "CPU#%d: PMC%d ctrl: %016llx\n", + printk(KERN_INFO "CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); - printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n", + printk(KERN_INFO "CPU#%d: gen-PMC%d count: %016llx\n", cpu, idx, pmc_count); - printk(KERN_INFO "CPU#%d: PMC%d left: %016llx\n", + printk(KERN_INFO "CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } + for (idx = 0; idx < nr_counters_fixed; idx++) { + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); + + printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n", + cpu, idx, pmc_count); + } local_irq_enable(); } @@ -323,6 +418,11 @@ static void pmc_generic_disable(struct perf_counter *counter) clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; + /* + * Make sure the cleared pointer becomes visible before we + * (potentially) free the counter: + */ + smp_wmb(); /* * Drain the remaining delta count out of a counter @@ -353,14 +453,11 @@ static void perf_save_and_restart(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; - u64 pmc_ctrl; - - rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); x86_perf_counter_update(counter, hwc, idx); __hw_perf_counter_set_period(counter, hwc, idx); - if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) + if (counter->state == PERF_COUNTER_STATE_ACTIVE) __pmc_generic_enable(counter, hwc, idx); } @@ -373,6 +470,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) * Store sibling timestamps (if any): */ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); 
perf_store_irq_data(sibling, counter->hw_event.type); perf_store_irq_data(sibling, atomic64_read(&counter->count)); @@ -403,7 +501,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) again: ack = status; - for_each_bit(bit, (unsigned long *) &status, nr_counters_generic) { + for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); @@ -561,6 +659,9 @@ void __init init_hw_perf_counters(void) perf_max_counters = nr_counters_generic; printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); + counter_value_mask = (1ULL << eax.split.bit_width) - 1; + printk(KERN_INFO "... value mask: %016Lx\n", counter_value_mask); + printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); nr_counters_fixed = edx.split.num_counters_fixed; -- cgit v1.2.3 From 2b583d8bc8d7105b58d7481a4a0ceb718dac49c6 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 27 Dec 2008 19:15:43 +0530 Subject: x86: perf_counter remove unwanted hw_perf_enable_all Impact: clean, reduce kernel size a bit, avoid sparse warnings Fixes sparse warnings: arch/x86/kernel/cpu/perf_counter.c:153:6: warning: symbol 'hw_perf_enable_all' was not declared. Should it be static? arch/x86/kernel/cpu/perf_counter.c:279:3: warning: returning void-valued expression arch/x86/kernel/cpu/perf_counter.c:206:3: warning: returning void-valued expression arch/x86/kernel/cpu/perf_counter.c:206:3: warning: returning void-valued expression Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index da46eca1254..9376771f757 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -150,14 +150,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return 0; } -void hw_perf_enable_all(void) -{ - if (unlikely(!perf_counters_initialized)) - return; - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask); -} - u64 hw_perf_save_disable(void) { u64 ctrl; @@ -200,12 +192,10 @@ static inline void __pmc_generic_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int idx) { - int err; - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - return __pmc_fixed_disable(counter, hwc, idx); - - err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); + __pmc_fixed_disable(counter, hwc, idx); + else + wrmsr_safe(hwc->config_base + idx, hwc->config, 0); } static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); @@ -276,10 +266,10 @@ __pmc_generic_enable(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - return __pmc_fixed_enable(counter, hwc, idx); - - wrmsr(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + __pmc_fixed_enable(counter, hwc, idx); + else + wrmsr(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } static int -- cgit v1.2.3 From 1b023a96d9b44f50f4d8ff28c15f5b80e354760f Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 10:13:01 +0100 Subject: perfcounters: throttle on too high IRQ rates Starting kerneltop with only -c 100 seems to be a bad idea, it can easily lock the system due to perfcounter IRQ overload. 
So add throttling: if a new IRQ arrives less than PERFMON_MIN_PERIOD_NS after the previous one, turn off perfcounters and unthrottle them from the next timer tick. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 2 ++ arch/x86/kernel/cpu/perf_counter.c | 38 ++++++++++++++++++++++++++++++------ 2 files changed, 34 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 7b434e5b14c..849c23009bf 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -781,6 +781,8 @@ static void local_apic_timer_interrupt(void) inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); + + perf_counter_unthrottle(); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9376771f757..1a040b179b5 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -33,6 +33,9 @@ static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 last_interrupt; + u64 global_enable; + int throttled; }; /* @@ -474,16 +477,19 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); - u64 ack, status, saved_global; - struct cpu_hw_counters *cpuc; + u64 ack, status, now; + struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); /* Disable counters globally */ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ack_APIC_irq(); - cpuc = &per_cpu(cpu_hw_counters, cpu); + now = sched_clock(); + if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS) + cpuc->throttled = 1; + cpuc->last_interrupt = now; rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); if (!status) @@ -533,9 +539,29 @@ again: goto again; out: /* - * Restore - do not reenable when global enable is off: + * Restore - do not reenable when global enable is off or throttled: */ - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); + if (!cpuc->throttled) + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); +} + +void perf_counter_unthrottle(void) +{ + struct cpu_hw_counters *cpuc; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return; + + if (unlikely(!perf_counters_initialized)) + return; + + cpuc = &per_cpu(cpu_hw_counters, smp_processor_id()); + if (cpuc->throttled) { + if (printk_ratelimit()) + printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n"); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + cpuc->throttled = 0; + } } void smp_perf_counter_interrupt(struct pt_regs *regs) -- cgit v1.2.3 From 4b39fd96855254a244f71245b41a91cdecb87d63 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 14:36:16 +0100 Subject: perfcounters: ratelimit performance counter interrupts Ratelimit performance counter interrupts to 100KHz per CPU. This replaces the irq-delta-time based method.
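The resulting logic is simple; the following is a simplified sketch of the hunks below (the ratelimit warning printk and a re-check of the global-enable MSR are omitted; cpuc is the per-CPU cpu_hw_counters state). The PMU IRQ handler leaves the counters globally disabled once the per-tick interrupt budget is spent, and the timer tick, via perf_counter_unthrottle(), re-enables them and resets the budget:

	#define PERFMON_MAX_INTERRUPTS (100000 / HZ)	/* 100 KHz per CPU */

	/* at the end of __smp_perf_counter_interrupt(): */
	if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);

	/* in perf_counter_unthrottle(), from the local APIC timer tick: */
	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS)
		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
	cpuc->interrupts = 0;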
Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1a040b179b5..a56d4cf92f3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -33,9 +33,8 @@ static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - u64 last_interrupt; + unsigned long interrupts; u64 global_enable; - int throttled; }; /* @@ -470,6 +469,11 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) } } +/* + * Maximum interrupt frequency of 100KHz per CPU + */ +#define PERFMON_MAX_INTERRUPTS 100000/HZ + /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -477,7 +481,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); - u64 ack, status, now; + u64 ack, status; struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); @@ -486,11 +490,6 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ack_APIC_irq(); - now = sched_clock(); - if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS) - cpuc->throttled = 1; - cpuc->last_interrupt = now; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); if (!status) goto out; @@ -541,13 +540,14 @@ out: /* * Restore - do not reenable when global enable is off or throttled: */ - if (!cpuc->throttled) + if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); } void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; + u64 global_enable; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; @@ -556,12 +556,15 @@ void perf_counter_unthrottle(void) return; cpuc = &per_cpu(cpu_hw_counters, smp_processor_id()); - if (cpuc->throttled) { + if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) - printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n"); + printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n"); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); - cpuc->throttled = 0; } + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable); + if (unlikely(cpuc->global_enable && !global_enable)) + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + cpuc->interrupts = 0; } void smp_perf_counter_interrupt(struct pt_regs *regs) -- cgit v1.2.3 From 3415dd9146c574bffe8f012c096bfc2bc62b9508 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 14:16:53 +0100 Subject: perfcounters fix section mismatch warning in perf_counter.c::perf_counters_lapic_init() Fix: WARNING: arch/x86/kernel/built-in.o(.text+0xdd0f): Section mismatch in reference from the function pmc_generic_enable() to the function .cpuinit.text:perf_counters_lapic_init() The function pmc_generic_enable() references the function __cpuinit perf_counters_lapic_init(). This is often because pmc_generic_enable lacks a __cpuinit annotation or the annotation of perf_counters_lapic_init is wrong. 
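The underlying rule, as an illustrative sketch (neither function body below is from the patch): __cpuinit text may be discarded after boot when CPU hotplug is disabled, so regular .text must not call into it:

	/* placed in .cpuinit.text, potentially freed after boot: */
	void __cpuinit perf_counters_lapic_init(int nmi);

	/* regular .text, runs whenever a counter is enabled: */
	static void enable_path(int nmi)
	{
		perf_counters_lapic_init(nmi);	/* unsafe cross-section call */
	}

Since the function is needed at counter-enable time, the fix below drops the __cpuinit annotation rather than annotating the callers.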
Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a56d4cf92f3..46c436cdd73 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -605,7 +605,7 @@ void perf_counter_notify(struct pt_regs *regs) local_irq_restore(flags); } -void __cpuinit perf_counters_lapic_init(int nmi) +void perf_counters_lapic_init(int nmi) { u32 apic_val; -- cgit v1.2.3 From bb3f0b59ad005d2d2ecbbe9bd048eab6d1ecbd31 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 25 Jan 2009 02:38:09 -0800 Subject: x86: make irqinit_32.c more like irqinit_64.c, v2 Impact: cleanup 1. add smp_intr_init and apic_intr_init for 32bit, the same as 64bit 2. move the apic_intr_init() call before the gates are set with interrupt[i] 3. for 64bit, if ia32_emulation is not used, allow per_cpu interrupts to use vector 0x80. [ v2: should use !test_bit() instead of test_bit() with 32bit ] Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 56 ++++++++++++++++++++++++++------------------ arch/x86/kernel/irqinit_64.c | 7 +++--- arch/x86/kernel/traps.c | 15 +++++------- 3 files changed, 43 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index c56496f8c6f..ddf3eb72f86 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -120,28 +120,8 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) +static void __init smp_intr_init(void) { - int i; - - /* all the set up before the call gates are initialised */ - pre_intr_init_hook(); - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); - } - #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper @@ -170,8 +150,13 @@ void __init native_init_IRQ(void) set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif +} +static void __init apic_intr_init(void) +{ #ifdef CONFIG_X86_LOCAL_APIC + smp_intr_init(); + /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -181,12 +166,37 @@ void __init native_init_IRQ(void) # ifdef CONFIG_PERF_COUNTERS alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); # endif -#endif -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) +# ifdef CONFIG_X86_MCE_P4THERMAL /* thermal monitor LVT interrupt */ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +# endif #endif +} + +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) +{ + int i; + + /* all the set up before the call gates are initialised */ + pre_intr_init_hook(); + + apic_intr_init(); + + /* + * Cover the whole vector space, no vector can escape * us.
(some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + /* SYSCALL_VECTOR was reserved in trap_init. */ + if (!test_bit(vector, used_vectors)) + set_intr_gate(vector, interrupt[i]); + } if (!acpi_ioapic) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 6a71bfc51e5..16e1fc68750 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -162,6 +162,9 @@ void __init native_init_IRQ(void) int i; init_ISA_irqs(); + + apic_intr_init(); + /* * Cover the whole vector space, no vector can escape * us. (some of these will be overridden and become @@ -169,12 +172,10 @@ void __init native_init_IRQ(void) */ for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) + if (!test_bit(vector, used_vectors)) set_intr_gate(vector, interrupt[i]); } - apic_intr_init(); - if (!acpi_ioapic) setup_irq(2, &irq2); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ed5aee5f3fc..d36a502d87a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -979,8 +979,13 @@ void __init trap_init(void) #endif set_intr_gate(19, &simd_coprocessor_error); + /* Reserve all the builtin and the syscall vector: */ + for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) + set_bit(i, used_vectors); + #ifdef CONFIG_IA32_EMULATION set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); + set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif #ifdef CONFIG_X86_32 @@ -997,17 +1002,9 @@ void __init trap_init(void) } set_system_trap_gate(SYSCALL_VECTOR, &system_call); -#endif - - /* Reserve all the builtin and the syscall vector: */ - for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) - set_bit(i, used_vectors); - -#ifdef CONFIG_X86_64 - set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#else set_bit(SYSCALL_VECTOR, used_vectors); #endif + /* * Should be a barrier for any external CPU state: */ -- cgit v1.2.3 From 15081c61362618a0c81cc8d04e45e7427bc1ed71 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 1 Feb 2009 22:07:39 +0530 Subject: x86: irqinit_32.c fix compilation warning Fix: arch/x86/kernel/irqinit_32.c:124: warning: 'smp_intr_init' defined but not used Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index ddf3eb72f86..520e6c1c5d2 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -154,9 +154,9 @@ static void __init smp_intr_init(void) static void __init apic_intr_init(void) { -#ifdef CONFIG_X86_LOCAL_APIC smp_intr_init(); +#ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); -- cgit v1.2.3 From 5b75af0a02fcf3b8899f38ff6f22164c5d8e2fdd Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 4 Feb 2009 17:11:34 +0100 Subject: perfcounters: fix "perf counters kill oprofile" bug With oprofile as a module, and unloaded by profiling script, both oprofile and kerneltop work fine.. unless you leave kerneltop running when you start profiling, then you may see badness. 
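The mechanism relied on here (a sketch; the priority values shown are exactly the ones the patch below installs): NMI die notifiers are called in descending ->priority order, and a handler that returns NOTIFY_STOP hides the NMI from lower-priority handlers. Giving oprofile's notifier a higher priority than the perfcounter one therefore lets oprofile take over the PMU NMI while it is loaded:

	static struct notifier_block profile_exceptions_nb = {
		.notifier_call	= profile_exceptions_notify,
		.next		= NULL,
		.priority	= 2,	/* above perf_counter_nmi_notifier (1) */
	};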
Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 +++- arch/x86/oprofile/nmi_int.c | 7 ++++--- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 46c436cdd73..8bb213323fe 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -643,7 +643,9 @@ perf_counter_nmi_handler(struct notifier_block *self, } static __read_mostly struct notifier_block perf_counter_nmi_notifier = { - .notifier_call = perf_counter_nmi_handler + .notifier_call = perf_counter_nmi_handler, + .next = NULL, + .priority = 1 }; void __init init_hw_perf_counters(void) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 202864ad49a..c638685136e 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self, switch (val) { case DIE_NMI: - if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu))) - ret = NOTIFY_STOP; + case DIE_NMI_IPI: + model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)); + ret = NOTIFY_STOP; break; default: break; @@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy) static struct notifier_block profile_exceptions_nb = { .notifier_call = profile_exceptions_notify, .next = NULL, - .priority = 0 + .priority = 2 }; static int nmi_setup(void) -- cgit v1.2.3 From 82aa9a1829199233f9bdaf26e2ee271114f4701e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Feb 2009 15:23:08 +0100 Subject: perfcounters: fix "perf counters kills oprofile" bug, v2 Impact: fix kernel crash Both oprofile and perfcounters register an NMI die handler, but only one can handle the NMI. Conveniently, oprofile unregisters its notifier when not actively in use, so setting its notifier priority higher than perfcounter's allows oprofile to borrow the NMI for the duration of its run. Tested/works both as module and built-in. While testing, I found that if kerneltop was generating NMIs at very high frequency, the kernel may panic when oprofile registered its handler. This turned out to be because oprofile registers its handler before reset_value has been allocated, so if an NMI comes in while it's still setting up, kabOom. Rather than try more invasive changes, I followed the lead of other places in op_model_ppro.c, and simply returned in that highly unlikely event. (debug warnings attached) Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_ppro.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 07c914555a5..85eb6268374 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -126,6 +126,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs, u64 val; int i; + /* + * This can happen if perf counters are in use when + * we steal the die notifier NMI.
+ */ + if (unlikely(!reset_value)) + goto out; + for (i = 0 ; i < num_counters; ++i) { if (!reset_value[i]) continue; @@ -136,6 +143,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs, } } +out: /* Only P6 based Pentium M need to re-unmask the apic vector but it * doesn't hurt other P6 variant */ apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); -- cgit v1.2.3 From d278c48435625cb6b7edcf6a547620768b175709 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 9 Feb 2009 07:38:50 +0100 Subject: perf_counters: account NMI interrupts I noticed that kerneltop interrupts were accounted as NMI, but not their perf counter origin. Account NMI performance counter interrupts. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8bb213323fe..9901e46998d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -495,6 +495,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) goto out; again: + inc_irq_stat(apic_perf_irqs); ack = status; for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_counter *counter = cpuc->counters[bit]; @@ -570,7 +571,6 @@ void perf_counter_unthrottle(void) void smp_perf_counter_interrupt(struct pt_regs *regs) { irq_enter(); - inc_irq_stat(apic_perf_irqs); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); __smp_perf_counter_interrupt(regs, 0); -- cgit v1.2.3 From 0475f9ea8e2cc030298908949e0d5da9f2fc2cfe Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 11 Feb 2009 14:35:35 +1100 Subject: perf_counters: allow users to count user, kernel and/or hypervisor events Impact: new perf_counter feature This extends the perf_counter_hw_event struct with bits that specify that events in user, kernel and/or hypervisor mode should not be counted (i.e. should be excluded), and adds code to program the PMU mode selection bits accordingly on x86 and powerpc. For software counters, we don't currently have the infrastructure to distinguish which mode an event occurs in, so we currently fail the counter initialization if the setting of the hw_event.exclude_* bits would require us to distinguish. Context switches and CPU migrations are currently considered to occur in kernel mode. On x86, this changes the previous policy that only root can count kernel events. Now non-root users can count kernel events or exclude them. Non-root users still can't use NMI events, though. On x86 we don't appear to have any way to control whether hypervisor events are counted or not, so hw_event.exclude_hv is ignored. On powerpc, the selection of whether to count events in user, kernel and/or hypervisor mode is PMU-wide, not per-counter, so this adds a check that the hw_event.exclude_* settings are the same as other events on the PMU. Counters being added to a group have to have the same settings as the other hardware counters in the group. Counters and groups can only be enabled in hw_perf_group_sched_in or power_perf_enable if they have the same settings as any other counters already on the PMU. If we are not running on a hypervisor, the exclude_hv setting is ignored (by forcing it to 0) since we can't ever get any hypervisor events.
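From the user side the new bits compose as in this hypothetical setup (only the exclude_* fields are introduced by this patch; the struct and type names are those used in the commit message):

	struct perf_counter_hw_event hw_event = {
		.type		= PERF_COUNT_INSTRUCTIONS,
		.exclude_user	= 0,	/* do count user-mode events */
		.exclude_kernel	= 1,	/* skip ring-0 events */
		.exclude_hv	= 1,	/* ignored on x86, see above */
	};

On x86 these map directly onto the EVENTSEL USR/OS bits, as the first hunk below shows.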
Signed-off-by: Paul Mackerras --- arch/x86/kernel/cpu/perf_counter.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9901e46998d..383d4c6423a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -107,21 +107,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -EINVAL; /* - * Count user events, and generate PMC IRQs: + * Generate PMC IRQs: * (keep 'enabled' bit clear for now) */ - hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT; + hwc->config = ARCH_PERFMON_EVENTSEL_INT; /* - * If privileged enough, count OS events too, and allow - * NMI events as well: + * Count user and OS events unless requested not to. */ - hwc->nmi = 0; - if (capable(CAP_SYS_ADMIN)) { + if (!hw_event->exclude_user) + hwc->config |= ARCH_PERFMON_EVENTSEL_USR; + if (!hw_event->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (hw_event->nmi) - hwc->nmi = 1; - } + + /* + * If privileged enough, allow NMI events: + */ + hwc->nmi = 0; + if (capable(CAP_SYS_ADMIN) && hw_event->nmi) + hwc->nmi = 1; hwc->irq_period = hw_event->irq_period; /* @@ -248,10 +252,13 @@ __pmc_fixed_enable(struct perf_counter *counter, int err; /* - * Enable IRQ generation (0x8) and ring-3 counting (0x2), - * and enable ring-0 counting if allowed: + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: */ - bits = 0x8ULL | 0x2ULL; + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) bits |= 0x1; bits <<= (idx * 4); -- cgit v1.2.3 From 73ca2f8380311115723c7afe811f3ed1f0ba945e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 16 Feb 2009 01:08:17 +0100 Subject: perfcounters: remove duplicate definition of LOCAL_PERF_VECTOR Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_vectors.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index b66b518ff00..b07278c55e9 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -111,11 +111,6 @@ */ #define LOCAL_PERF_VECTOR 0xee -/* - * Performance monitoring interrupt vector: - */ -#define LOCAL_PERF_VECTOR 0xee - /* * First APIC vector available to drivers: (vectors 0x30-0xee) we * start at 0x31(0x41) to spread out vectors evenly between priority -- cgit v1.2.3 From b56a3802dc6df29aa27d2c12edf420258091ad66 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 27 Feb 2009 18:09:09 +0530 Subject: x86: prepare perf_counter to add more cpus Introduced struct pmc_x86_ops to add more cpus. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 106 +++++++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 383d4c6423a..a3c88529bb7 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -3,6 +3,7 @@ * * Copyright(C) 2008 Thomas Gleixner * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar + * Copyright(C) 2009 Jaswinder Singh Rajput * * For licencing details see kernel-base/COPYING */ @@ -38,10 +39,24 @@ struct cpu_hw_counters { }; /* - * Intel PerfMon v3. 
Used on Core2 and later. + * struct pmc_x86_ops - performance counter x86 ops */ +struct pmc_x86_ops { + u64 (*save_disable_all) (void); + void (*restore_all) (u64 ctrl); + unsigned eventsel; + unsigned perfctr; + int (*event_map) (int event); + int max_events; +}; + +static struct pmc_x86_ops *pmc_ops; + static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); +/* + * Intel PerfMon v3. Used on Core2 and later. + */ static const int intel_perfmon_event_map[] = { [PERF_COUNT_CPU_CYCLES] = 0x003c, @@ -53,7 +68,10 @@ static const int intel_perfmon_event_map[] = [PERF_COUNT_BUS_CYCLES] = 0x013c, }; -static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); +static int pmc_intel_event_map(int event) +{ + return intel_perfmon_event_map[event]; +} /* * Propagate counter elapsed time into the generic counter. @@ -144,38 +162,48 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (hw_event->raw) { hwc->config |= hw_event->type; } else { - if (hw_event->type >= max_intel_perfmon_events) + if (hw_event->type >= pmc_ops->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= intel_perfmon_event_map[hw_event->type]; + hwc->config |= pmc_ops->event_map(hw_event->type); } counter->wakeup_pending = 0; return 0; } -u64 hw_perf_save_disable(void) +static u64 pmc_intel_save_disable_all(void) { u64 ctrl; - if (unlikely(!perf_counters_initialized)) - return 0; - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); return ctrl; } + +u64 hw_perf_save_disable(void) +{ + if (unlikely(!perf_counters_initialized)) + return 0; + + return pmc_ops->save_disable_all(); +} EXPORT_SYMBOL_GPL(hw_perf_save_disable); +static void pmc_intel_restore_all(u64 ctrl) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); +} + void hw_perf_restore(u64 ctrl) { if (unlikely(!perf_counters_initialized)) return; - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + pmc_ops->restore_all(ctrl); } EXPORT_SYMBOL_GPL(hw_perf_restore); @@ -291,11 +319,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS])) + if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES])) + if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES])) + if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; @@ -339,8 +367,8 @@ try_generic: set_bit(idx, cpuc->used); hwc->idx = idx; } - hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; - hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; + hwc->config_base = pmc_ops->eventsel; + hwc->counter_base = pmc_ops->perfctr; } perf_counters_lapic_init(hwc->nmi); @@ -386,8 +414,8 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { - rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); - rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); + rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl); + rdmsrl(pmc_ops->perfctr + idx, pmc_count); prev_left = per_cpu(prev_left[idx], cpu); @@ -655,29 +683,56 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { .priority = 1 }; -void __init init_hw_perf_counters(void) +static 
struct pmc_x86_ops pmc_intel_ops = { + .save_disable_all = pmc_intel_save_disable_all, + .restore_all = pmc_intel_restore_all, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = pmc_intel_event_map, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), +}; + +static struct pmc_x86_ops *pmc_intel_init(void) { union cpuid10_eax eax; unsigned int ebx; unsigned int unused; union cpuid10_edx edx; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return; - /* * Check whether the Architectural PerfMon supports * Branch Misses Retired Event or not. */ cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) - return; + return NULL; printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); - printk(KERN_INFO "... version: %d\n", eax.split.version_id); - printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters); + printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); + printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); + nr_counters_generic = eax.split.num_counters; + nr_counters_fixed = edx.split.num_counters_fixed; + counter_value_mask = (1ULL << eax.split.bit_width) - 1; + + return &pmc_intel_ops; +} + +void __init init_hw_perf_counters(void) +{ + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + pmc_ops = pmc_intel_init(); + break; + } + if (!pmc_ops) + return; + + printk(KERN_INFO "... num counters: %d\n", nr_counters_generic); if (nr_counters_generic > X86_PMC_MAX_GENERIC) { nr_counters_generic = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", @@ -686,13 +741,8 @@ void __init init_hw_perf_counters(void) perf_counter_mask = (1 << nr_counters_generic) - 1; perf_max_counters = nr_counters_generic; - printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); - counter_value_mask = (1ULL << eax.split.bit_width) - 1; printk(KERN_INFO "... value mask: %016Lx\n", counter_value_mask); - printk(KERN_INFO "... 
mask length: %d\n", eax.split.mask_length); - - nr_counters_fixed = edx.split.num_counters_fixed; if (nr_counters_fixed > X86_PMC_MAX_FIXED) { nr_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", -- cgit v1.2.3 From f87ad35d37fa543925210550f7db20a54c83ed70 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 27 Feb 2009 20:15:14 +0530 Subject: x86: AMD Support for perf_counter Supported basic performance counter for AMD K7 and later: $ perfstat -e 0,1,2,3,4,5,-1,-2,-3,-4,-5 ls > /dev/null Performance counter stats for 'ls': 12.298610 task clock ticks (msecs) 3298477 CPU cycles (events) 1406354 instructions (events) 749035 cache references (events) 16939 cache misses (events) 100589 branches (events) 11159 branch misses (events) 7.627540 cpu clock ticks (msecs) 12.298610 task clock ticks (msecs) 500 pagefaults (events) 6 context switches (events) 3 CPU migrations (events) Wall-clock time elapsed: 8.672290 msecs Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 4 ++ arch/x86/kernel/cpu/perf_counter.c | 83 +++++++++++++++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 25423a5b80e..edcde52bd17 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -368,6 +368,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 6) set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); + /* Enable Performance counter for K7 and later */ + if (c->x86 > 6 && c->x86 <= 0x11) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + if (!c->x86_model_id[0]) { switch (c->x86) { case 0xf: diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a3c88529bb7..266618aa1a0 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -73,6 +73,24 @@ static int pmc_intel_event_map(int event) return intel_perfmon_event_map[event]; } +/* + * AMD Performance Monitor K7 and later. + */ +static const int amd_perfmon_event_map[] = +{ + [PERF_COUNT_CPU_CYCLES] = 0x0076, + [PERF_COUNT_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_CACHE_MISSES] = 0x0081, + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_BRANCH_MISSES] = 0x00c5, +}; + +static int pmc_amd_event_map(int event) +{ + return amd_perfmon_event_map[event]; +} + /* * Propagate counter elapsed time into the generic counter. * Can only be executed on the CPU where the counter is active. 
@@ -151,8 +169,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * so we install an artificial 1<<31 period regardless of * the generic counter period: */ - if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) - hwc->irq_period = 0x7FFFFFFF; + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) + hwc->irq_period = 0x7FFFFFFF; atomic64_set(&hwc->period_left, hwc->irq_period); @@ -184,6 +203,22 @@ static u64 pmc_intel_save_disable_all(void) return ctrl; } +static u64 pmc_amd_save_disable_all(void) +{ + int idx; + u64 val, ctrl = 0; + + for (idx = 0; idx < nr_counters_generic; idx++) { + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + ctrl |= (1 << idx); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } + + return ctrl; +} + u64 hw_perf_save_disable(void) { if (unlikely(!perf_counters_initialized)) @@ -198,6 +233,20 @@ static void pmc_intel_restore_all(u64 ctrl) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); } +static void pmc_amd_restore_all(u64 ctrl) +{ + u64 val; + int idx; + + for (idx = 0; idx < nr_counters_generic; idx++) { + if (ctrl & (1 << idx)) { + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } + } +} + void hw_perf_restore(u64 ctrl) { if (unlikely(!perf_counters_initialized)) @@ -314,6 +363,9 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { unsigned int event; + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return -1; + if (unlikely(hwc->nmi)) return -1; @@ -401,6 +453,7 @@ void perf_counter_print_debug(void) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); @@ -411,6 +464,7 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); printk(KERN_INFO "CPU#%d: fixed: %016llx\n", cpu, fixed); + } printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { @@ -588,6 +642,9 @@ void perf_counter_unthrottle(void) if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return; + if (unlikely(!perf_counters_initialized)) return; @@ -692,6 +749,15 @@ static struct pmc_x86_ops pmc_intel_ops = { .max_events = ARRAY_SIZE(intel_perfmon_event_map), }; +static struct pmc_x86_ops pmc_amd_ops = { + .save_disable_all = pmc_amd_save_disable_all, + .restore_all = pmc_amd_restore_all, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .event_map = pmc_amd_event_map, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), +}; + static struct pmc_x86_ops *pmc_intel_init(void) { union cpuid10_eax eax; @@ -719,6 +785,16 @@ static struct pmc_x86_ops *pmc_intel_init(void) return &pmc_intel_ops; } +static struct pmc_x86_ops *pmc_amd_init(void) +{ + nr_counters_generic = 4; + nr_counters_fixed = 0; + + printk(KERN_INFO "AMD Performance Monitoring support detected.\n"); + + return &pmc_amd_ops; +} + void __init init_hw_perf_counters(void) { if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) @@ -728,6 +804,9 @@ void __init init_hw_perf_counters(void) case X86_VENDOR_INTEL: pmc_ops = pmc_intel_init(); break; + case X86_VENDOR_AMD: 
+ pmc_ops = pmc_amd_init(); + break; } if (!pmc_ops) return; -- cgit v1.2.3 From 169e41eb7f5464c077a7e0e129f025759d04cc54 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 28 Feb 2009 18:37:49 +0530 Subject: x86: decent declarations in perf_counter.c Impact: cleanup making decent declarations for struct pmc_x86_ops and fixing a checkpatch error: ERROR: Macros with complex values should be enclosed in parenthesis Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 266618aa1a0..a1f3646a3e8 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -42,12 +42,12 @@ struct cpu_hw_counters { * struct pmc_x86_ops - performance counter x86 ops */ struct pmc_x86_ops { - u64 (*save_disable_all) (void); - void (*restore_all) (u64 ctrl); - unsigned eventsel; - unsigned perfctr; - int (*event_map) (int event); - int max_events; + u64 (*save_disable_all)(void); + void (*restore_all)(u64 ctrl); + unsigned eventsel; + unsigned perfctr; + int (*event_map)(int event); + int max_events; }; static struct pmc_x86_ops *pmc_ops; @@ -561,7 +561,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) /* * Maximum interrupt frequency of 100KHz per CPU */ -#define PERFMON_MAX_INTERRUPTS 100000/HZ +#define PERFMON_MAX_INTERRUPTS (100000/HZ) /* * This handler is triggered by the local APIC, so the APIC IRQ handling -- cgit v1.2.3 From a1ef58f442542d8b3e3b963339fbc522c36e827c Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 28 Feb 2009 18:45:39 +0530 Subject: x86: use pr_info in perf_counter.c Impact: cleanup using pr_info in perf_counter.c fixes various 80-character warnings and also the indentation of a conditional statement Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 48 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a1f3646a3e8..3b65f19a668 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -454,18 +454,18 @@ void perf_counter_print_debug(void) cpuc = &per_cpu(cpu_hw_counters, cpu); if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); - - printk(KERN_INFO "\n"); - printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); - printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); - printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); - printk(KERN_INFO "CPU#%d: fixed: %016llx\n", cpu, fixed); + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); + + pr_info("\n"); + pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); + pr_info("CPU#%d: status: %016llx\n", cpu, status); + pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); + pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); } - printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); + pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx
< nr_counters_generic; idx++) { rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl); @@ -473,17 +473,17 @@ void perf_counter_print_debug(void) prev_left = per_cpu(prev_left[idx], cpu); - printk(KERN_INFO "CPU#%d: gen-PMC%d ctrl: %016llx\n", + pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); - printk(KERN_INFO "CPU#%d: gen-PMC%d count: %016llx\n", + pr_info("CPU#%d: gen-PMC%d count: %016llx\n", cpu, idx, pmc_count); - printk(KERN_INFO "CPU#%d: gen-PMC%d left: %016llx\n", + pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } for (idx = 0; idx < nr_counters_fixed; idx++) { rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); - printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n", + pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); } local_irq_enable(); @@ -773,10 +773,10 @@ static struct pmc_x86_ops *pmc_intel_init(void) if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return NULL; - printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); - printk(KERN_INFO "... version: %d\n", eax.split.version_id); - printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); - printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); + pr_info("Intel Performance Monitoring support detected.\n"); + pr_info("... version: %d\n", eax.split.version_id); + pr_info("... bit width: %d\n", eax.split.bit_width); + pr_info("... mask length: %d\n", eax.split.mask_length); nr_counters_generic = eax.split.num_counters; nr_counters_fixed = edx.split.num_counters_fixed; @@ -790,7 +790,7 @@ static struct pmc_x86_ops *pmc_amd_init(void) nr_counters_generic = 4; nr_counters_fixed = 0; - printk(KERN_INFO "AMD Performance Monitoring support detected.\n"); + pr_info("AMD Performance Monitoring support detected.\n"); return &pmc_amd_ops; } @@ -811,7 +811,7 @@ void __init init_hw_perf_counters(void) if (!pmc_ops) return; - printk(KERN_INFO "... num counters: %d\n", nr_counters_generic); + pr_info("... num counters: %d\n", nr_counters_generic); if (nr_counters_generic > X86_PMC_MAX_GENERIC) { nr_counters_generic = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", @@ -820,18 +820,18 @@ void __init init_hw_perf_counters(void) perf_counter_mask = (1 << nr_counters_generic) - 1; perf_max_counters = nr_counters_generic; - printk(KERN_INFO "... value mask: %016Lx\n", counter_value_mask); + pr_info("... value mask: %016Lx\n", counter_value_mask); if (nr_counters_fixed > X86_PMC_MAX_FIXED) { nr_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", nr_counters_fixed, X86_PMC_MAX_FIXED); } - printk(KERN_INFO "... fixed counters: %d\n", nr_counters_fixed); + pr_info("... fixed counters: %d\n", nr_counters_fixed); perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED; - printk(KERN_INFO "... counter mask: %016Lx\n", perf_counter_mask); + pr_info("... counter mask: %016Lx\n", perf_counter_mask); perf_counters_initialized = true; perf_counters_lapic_init(0); -- cgit v1.2.3 From b0f3f28e0f14eb335f67bfaae33ce8b8d74fd58b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Mar 2009 18:08:27 +0100 Subject: perfcounters: IRQ and NMI support on AMD CPUs The below completes the K7+ performance counter support: - IRQ support - NMI support KernelTop output works now as well. 
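One noteworthy piece, sketched here from the pmc_amd_get_status() logic the patch adds below (counter_value_bits is the probed hardware counter width): K7-class PMUs have no GLOBAL_STATUS MSR, so overflow must be inferred per counter. Counters are programmed to start at a negative offset and count up through zero, so a counter whose value is non-negative when sign-extended at the hardware width has completed its period:

	u64 status = 0;
	s64 val;
	int idx;

	for (idx = 0; idx < nr_counters_generic; idx++) {
		rdmsrl(MSR_K7_PERFCTR0 + idx, val);
		val <<= (64 - counter_value_bits);	/* sign-extend */
		if (val >= 0)				/* crossed zero */
			status |= (1 << idx);
	}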
Signed-off-by: Peter Zijlstra Cc: Jaswinder Singh Rajput Cc: Paul Mackerras LKML-Reference: <1236273633.5187.286.camel@laptop> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 272 +++++++++++++++++++++++++++++++------ 1 file changed, 228 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3b65f19a668..6ebe9abf6ae 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -28,6 +28,7 @@ static bool perf_counters_initialized __read_mostly; static int nr_counters_generic __read_mostly; static u64 perf_counter_mask __read_mostly; static u64 counter_value_mask __read_mostly; +static int counter_value_bits __read_mostly; static int nr_counters_fixed __read_mostly; @@ -35,7 +36,9 @@ struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; - u64 global_enable; + u64 throttle_ctrl; + u64 active_mask; + int enabled; }; /* @@ -43,21 +46,28 @@ struct cpu_hw_counters { */ struct pmc_x86_ops { u64 (*save_disable_all)(void); - void (*restore_all)(u64 ctrl); + void (*restore_all)(u64); + u64 (*get_status)(u64); + void (*ack_status)(u64); + void (*enable)(int, u64); + void (*disable)(int, u64); unsigned eventsel; unsigned perfctr; - int (*event_map)(int event); + u64 (*event_map)(int); + u64 (*raw_event)(u64); int max_events; }; static struct pmc_x86_ops *pmc_ops; -static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); +static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { + .enabled = 1, +}; /* * Intel PerfMon v3. Used on Core2 and later. */ -static const int intel_perfmon_event_map[] = +static const u64 intel_perfmon_event_map[] = { [PERF_COUNT_CPU_CYCLES] = 0x003c, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, @@ -68,15 +78,29 @@ static const int intel_perfmon_event_map[] = [PERF_COUNT_BUS_CYCLES] = 0x013c, }; -static int pmc_intel_event_map(int event) +static u64 pmc_intel_event_map(int event) { return intel_perfmon_event_map[event]; } +static u64 pmc_intel_raw_event(u64 event) +{ +#define CORE_EVNTSEL_EVENT_MASK 0x000000FF +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00 +#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000 + +#define CORE_EVNTSEL_MASK \ + (CORE_EVNTSEL_EVENT_MASK | \ + CORE_EVNTSEL_UNIT_MASK | \ + CORE_EVNTSEL_COUNTER_MASK) + + return event & CORE_EVNTSEL_MASK; +} + /* * AMD Performance Monitor K7 and later. */ -static const int amd_perfmon_event_map[] = +static const u64 amd_perfmon_event_map[] = { [PERF_COUNT_CPU_CYCLES] = 0x0076, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, @@ -86,11 +110,25 @@ static const int amd_perfmon_event_map[] = [PERF_COUNT_BRANCH_MISSES] = 0x00c5, }; -static int pmc_amd_event_map(int event) +static u64 pmc_amd_event_map(int event) { return amd_perfmon_event_map[event]; } +static u64 pmc_amd_raw_event(u64 event) +{ +#define K7_EVNTSEL_EVENT_MASK 0x7000000FF +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00 +#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000 + +#define K7_EVNTSEL_MASK \ + (K7_EVNTSEL_EVENT_MASK | \ + K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_COUNTER_MASK) + + return event & K7_EVNTSEL_MASK; +} + /* * Propagate counter elapsed time into the generic counter. * Can only be executed on the CPU where the counter is active. 
@@ -179,7 +217,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * Raw event type provide the config in the event structure */ if (hw_event->raw) { - hwc->config |= hw_event->type; + hwc->config |= pmc_ops->raw_event(hw_event->type); } else { if (hw_event->type >= pmc_ops->max_events) return -EINVAL; @@ -205,18 +243,24 @@ static u64 pmc_intel_save_disable_all(void) static u64 pmc_amd_save_disable_all(void) { - int idx; - u64 val, ctrl = 0; + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + int enabled, idx; + + enabled = cpuc->enabled; + cpuc->enabled = 0; + barrier(); for (idx = 0; idx < nr_counters_generic; idx++) { + u64 val; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) - ctrl |= (1 << idx); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) { + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } } - return ctrl; + return enabled; } u64 hw_perf_save_disable(void) @@ -226,6 +270,9 @@ u64 hw_perf_save_disable(void) return pmc_ops->save_disable_all(); } +/* + * Exported because of ACPI idle + */ EXPORT_SYMBOL_GPL(hw_perf_save_disable); static void pmc_intel_restore_all(u64 ctrl) @@ -235,11 +282,18 @@ static void pmc_intel_restore_all(u64 ctrl) static void pmc_amd_restore_all(u64 ctrl) { - u64 val; + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int idx; + cpuc->enabled = ctrl; + barrier(); + if (!ctrl) + return; + for (idx = 0; idx < nr_counters_generic; idx++) { - if (ctrl & (1 << idx)) { + if (test_bit(idx, (unsigned long *)&cpuc->active_mask)) { + u64 val; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); val |= ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(MSR_K7_EVNTSEL0 + idx, val); @@ -254,8 +308,112 @@ void hw_perf_restore(u64 ctrl) pmc_ops->restore_all(ctrl); } +/* + * Exported because of ACPI idle + */ EXPORT_SYMBOL_GPL(hw_perf_restore); +static u64 pmc_intel_get_status(u64 mask) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static u64 pmc_amd_get_status(u64 mask) +{ + u64 status = 0; + int idx; + + for (idx = 0; idx < nr_counters_generic; idx++) { + s64 val; + + if (!(mask & (1 << idx))) + continue; + + rdmsrl(MSR_K7_PERFCTR0 + idx, val); + val <<= (64 - counter_value_bits); + if (val >= 0) + status |= (1 << idx); + } + + return status; +} + +static u64 hw_perf_get_status(u64 mask) +{ + if (unlikely(!perf_counters_initialized)) + return 0; + + return pmc_ops->get_status(mask); +} + +static void pmc_intel_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static void pmc_amd_ack_status(u64 ack) +{ +} + +static void hw_perf_ack_status(u64 ack) +{ + if (unlikely(!perf_counters_initialized)) + return; + + pmc_ops->ack_status(ack); +} + +static void pmc_intel_enable(int idx, u64 config) +{ + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, + config | ARCH_PERFMON_EVENTSEL0_ENABLE); +} + +static void pmc_amd_enable(int idx, u64 config) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + set_bit(idx, (unsigned long *)&cpuc->active_mask); + if (cpuc->enabled) + config |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + wrmsrl(MSR_K7_EVNTSEL0 + idx, config); +} + +static void hw_perf_enable(int idx, u64 config) +{ + if (unlikely(!perf_counters_initialized)) + return; + + pmc_ops->enable(idx, config); +} + +static void pmc_intel_disable(int idx, u64 config) +{ + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config); +} + +static void pmc_amd_disable(int idx, 
u64 config) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + clear_bit(idx, (unsigned long *)&cpuc->active_mask); + wrmsrl(MSR_K7_EVNTSEL0 + idx, config); + +} + +static void hw_perf_disable(int idx, u64 config) +{ + if (unlikely(!perf_counters_initialized)) + return; + + pmc_ops->disable(idx, config); +} + static inline void __pmc_fixed_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int __idx) @@ -278,7 +436,7 @@ __pmc_generic_disable(struct perf_counter *counter, if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_disable(counter, hwc, idx); else - wrmsr_safe(hwc->config_base + idx, hwc->config, 0); + hw_perf_disable(idx, hwc->config); } static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); @@ -354,8 +512,7 @@ __pmc_generic_enable(struct perf_counter *counter, if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_enable(counter, hwc, idx); else - wrmsr(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + hw_perf_enable(idx, hwc->config); } static int @@ -567,22 +724,20 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: */ -static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) +static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); u64 ack, status; struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); + int ret = 0; - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); - - /* Disable counters globally */ - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - ack_APIC_irq(); + cpuc->throttle_ctrl = hw_perf_save_disable(); - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + status = hw_perf_get_status(cpuc->throttle_ctrl); if (!status) goto out; + ret = 1; again: inc_irq_stat(apic_perf_irqs); ack = status; @@ -618,12 +773,12 @@ again: } } - wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); + hw_perf_ack_status(ack); /* * Repeat if there is more work to be done: */ - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + status = hw_perf_get_status(cpuc->throttle_ctrl); if (status) goto again; out: @@ -631,32 +786,27 @@ out: * Restore - do not reenable when global enable is off or throttled: */ if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + hw_perf_restore(cpuc->throttle_ctrl); + + return ret; } void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; - u64 global_enable; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return; - if (unlikely(!perf_counters_initialized)) return; - cpuc = &per_cpu(cpu_hw_counters, smp_processor_id()); + cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n"); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + hw_perf_restore(cpuc->throttle_ctrl); } - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable); - if (unlikely(cpuc->global_enable && !global_enable)) - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); cpuc->interrupts = 0; } @@ -664,8 +814,8 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) { irq_enter(); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); + ack_APIC_irq(); __smp_perf_counter_interrupt(regs, 0); - irq_exit(); } @@ -722,16 +872,23 @@ perf_counter_nmi_handler(struct notifier_block 
*self, { struct die_args *args = __args; struct pt_regs *regs; + int ret; + + switch (cmd) { + case DIE_NMI: + case DIE_NMI_IPI: + break; - if (likely(cmd != DIE_NMI_IPI)) + default: return NOTIFY_DONE; + } regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - __smp_perf_counter_interrupt(regs, 1); + ret = __smp_perf_counter_interrupt(regs, 1); - return NOTIFY_STOP; + return ret ? NOTIFY_STOP : NOTIFY_OK; } static __read_mostly struct notifier_block perf_counter_nmi_notifier = { @@ -743,18 +900,28 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { static struct pmc_x86_ops pmc_intel_ops = { .save_disable_all = pmc_intel_save_disable_all, .restore_all = pmc_intel_restore_all, + .get_status = pmc_intel_get_status, + .ack_status = pmc_intel_ack_status, + .enable = pmc_intel_enable, + .disable = pmc_intel_disable, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, .event_map = pmc_intel_event_map, + .raw_event = pmc_intel_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), }; static struct pmc_x86_ops pmc_amd_ops = { .save_disable_all = pmc_amd_save_disable_all, .restore_all = pmc_amd_restore_all, + .get_status = pmc_amd_get_status, + .ack_status = pmc_amd_ack_status, + .enable = pmc_amd_enable, + .disable = pmc_amd_disable, .eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, .event_map = pmc_amd_event_map, + .raw_event = pmc_amd_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), }; @@ -787,8 +954,25 @@ static struct pmc_x86_ops *pmc_intel_init(void) static struct pmc_x86_ops *pmc_amd_init(void) { + u64 old; + int bits; + nr_counters_generic = 4; nr_counters_fixed = 0; + counter_value_mask = ~0ULL; + + rdmsrl(MSR_K7_PERFCTR0, old); + wrmsrl(MSR_K7_PERFCTR0, counter_value_mask); + /* + * read the truncated mask + */ + rdmsrl(MSR_K7_PERFCTR0, counter_value_mask); + wrmsrl(MSR_K7_PERFCTR0, old); + + bits = 32 + fls(counter_value_mask >> 32); + if (bits == 32) + bits = fls((u32)counter_value_mask); + counter_value_bits = bits; pr_info("AMD Performance Monitoring support detected.\n"); -- cgit v1.2.3 From b5e8acf66ff5db707c7e08df49fdf6b415878442 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Mar 2009 20:34:21 +0100 Subject: perfcounters: IRQ and NMI support on AMD CPUs, fix The BKGD suggests that counter width on AMD CPUs is 48 for all existing models (it certainly is for mine). 
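For reference, the pmc_amd_get_status() overflow test added in the patch above depends on exactly this counter width: the raw counter value is shifted left so the counter's top bit lands in bit 63, and a non-negative signed result means the counter has crossed zero, i.e. overflowed. A standalone user-space model of that test (not kernel code; the sample values are made up):

  #include <stdint.h>
  #include <stdio.h>

  static int counter_overflowed(uint64_t raw, int counter_value_bits)
  {
          /* move the counter's top bit into the sign bit */
          int64_t val = (int64_t)(raw << (64 - counter_value_bits));

          /*
           * Counters are programmed with a negative offset, so the top
           * bit stays set while counting; once it clears, the counter
           * has wrapped past zero.
           */
          return val >= 0;
  }

  int main(void)
  {
          printf("%d\n", counter_overflowed(0x800000000000ULL, 48)); /* 0: still counting */
          printf("%d\n", counter_overflowed(0x000000000100ULL, 48)); /* 1: overflowed */
          return 0;
  }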
Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6ebe9abf6ae..f5853718d4d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -959,20 +959,8 @@ static struct pmc_x86_ops *pmc_amd_init(void) nr_counters_generic = 4; nr_counters_fixed = 0; - counter_value_mask = ~0ULL; - - rdmsrl(MSR_K7_PERFCTR0, old); - wrmsrl(MSR_K7_PERFCTR0, counter_value_mask); - /* - * read the truncated mask - */ - rdmsrl(MSR_K7_PERFCTR0, counter_value_mask); - wrmsrl(MSR_K7_PERFCTR0, old); - - bits = 32 + fls(counter_value_mask >> 32); - if (bits == 32) - bits = fls((u32)counter_value_mask); - counter_value_bits = bits; + counter_value_mask = 0x0000FFFFFFFFFFFFULL; + counter_value_bits = 48; pr_info("AMD Performance Monitoring support detected.\n"); -- cgit v1.2.3 From 184fe4ab1f2e4dfa45584889bb3820031648386b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 8 Mar 2009 11:34:19 +0100 Subject: x86: perf_counter cleanup Use and actual unsigned long bitmap instead of casting our way around. Signed-off-by: Peter Zijlstra Cc: Jaswinder Singh Rajput LKML-Reference: <1236508459.22914.3645.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f5853718d4d..1df421042b2 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -37,7 +37,7 @@ struct cpu_hw_counters { unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; u64 throttle_ctrl; - u64 active_mask; + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; int enabled; }; @@ -291,7 +291,7 @@ static void pmc_amd_restore_all(u64 ctrl) return; for (idx = 0; idx < nr_counters_generic; idx++) { - if (test_bit(idx, (unsigned long *)&cpuc->active_mask)) { + if (test_bit(idx, cpuc->active_mask)) { u64 val; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); @@ -377,7 +377,7 @@ static void pmc_amd_enable(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - set_bit(idx, (unsigned long *)&cpuc->active_mask); + set_bit(idx, cpuc->active_mask); if (cpuc->enabled) config |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -401,7 +401,7 @@ static void pmc_amd_disable(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - clear_bit(idx, (unsigned long *)&cpuc->active_mask); + clear_bit(idx, cpuc->active_mask); wrmsrl(MSR_K7_EVNTSEL0 + idx, config); } -- cgit v1.2.3 From e255357764f92afcafafbd4879b222b8c752065a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 8 Mar 2009 17:09:49 +0530 Subject: x86: perf_counter cleanup Remove unused variables and duplicate header file. 
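As an aside on the active_mask conversion in the patch above: declaring the field as a real unsigned long bitmap, rather than casting a u64, makes the word indexing explicit and portable. A minimal user-space model of the idiom (the kernel's set_bit()/test_bit() are additionally atomic, which this sketch is not):

  #include <limits.h>
  #include <stdio.h>

  #define BITS_PER_LONG     (CHAR_BIT * sizeof(long))
  #define BITS_TO_LONGS(nr) (((nr) + BITS_PER_LONG - 1) / BITS_PER_LONG)
  #define X86_PMC_IDX_MAX   64

  static unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];

  static void set_bit_(unsigned int nr, unsigned long *addr)
  {
          addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
  }

  static int test_bit_(unsigned int nr, const unsigned long *addr)
  {
          return (addr[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
  }

  int main(void)
  {
          set_bit_(40, active_mask);
          /* lands in the right word on 32-bit and 64-bit alike */
          printf("%d\n", test_bit_(40, active_mask));
          return 0;
  }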
Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1df421042b2..155bc3c239b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -17,7 +17,6 @@ #include #include -#include #include static bool perf_counters_initialized __read_mostly; @@ -954,9 +953,6 @@ static struct pmc_x86_ops *pmc_intel_init(void) static struct pmc_x86_ops *pmc_amd_init(void) { - u64 old; - int bits; - nr_counters_generic = 4; nr_counters_fixed = 0; counter_value_mask = 0x0000FFFFFFFFFFFFULL; -- cgit v1.2.3 From 595258aaeac4cc6e187b98b1bf29bb176febe763 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:28 +0100 Subject: perf_counter: x86: fix 32-bit irq_period assumption No need to assume the irq_period is 32bit. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 155bc3c239b..1cedc3468ce 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -449,7 +449,7 @@ __hw_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s32 period = hwc->irq_period; + s64 period = hwc->irq_period; int err; /* -- cgit v1.2.3 From 60b3df9c1e24a18aabb412da9905208c5f04ebea Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:30 +0100 Subject: perf_counter: add comment to barrier We need to ensure the enabled=0 write happens before we start disabling the actual counters, so that a pcm_amd_enable() will not enable one underneath us. I think the race is impossible anyway, we always balance the ops within any one context and perform enable() with IRQs disabled. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1cedc3468ce..a2e3b76bfdc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -247,6 +247,10 @@ static u64 pmc_amd_save_disable_all(void) enabled = cpuc->enabled; cpuc->enabled = 0; + /* + * ensure we write the disable before we start disabling the + * counters proper, so that pcm_amd_enable() does the right thing. + */ barrier(); for (idx = 0; idx < nr_counters_generic; idx++) { -- cgit v1.2.3 From 82bae4f8c2fd64a2bb1e2e72c508853ed2b4a299 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:31 +0100 Subject: perf_counter: x86: use ULL postfix for 64bit constants Fix a build warning on 32bit machines by explicitly marking the constants as 64-bit. 
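To see the class of bug the ULL suffixes rule out, consider this standalone demo (the constants here are hypothetical): on an ILP32 target an unsuffixed 32-bit-wide mask has type unsigned int, so arithmetic done on the constant itself -- a complement, for instance -- happens in 32 bits before it ever meets a u64. Constants such as 0x7000000FF, which need more than 32 bits, additionally draw warnings along the lines of "integer constant is too large" on 32-bit builds, which is what this patch silences.

  #include <stdint.h>
  #include <stdio.h>

  #define EVNTSEL_MASK_BAD   0xFF000000      /* unsigned int on ILP32 */
  #define EVNTSEL_MASK_GOOD  0xFF000000ULL

  int main(void)
  {
          uint64_t config = 0xdeadbeefcafef00dULL;

          /* ~MASK computed in 32 bits, then zero-extended: the high
             half of config is silently thrown away */
          printf("%016llx\n", (unsigned long long)(config & ~EVNTSEL_MASK_BAD));
          printf("%016llx\n", (unsigned long long)(config & ~EVNTSEL_MASK_GOOD));
          return 0;
  }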
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a2e3b76bfdc..22dab06c08a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -84,9 +84,9 @@ static u64 pmc_intel_event_map(int event) static u64 pmc_intel_raw_event(u64 event) { -#define CORE_EVNTSEL_EVENT_MASK 0x000000FF -#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00 -#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000 +#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ @@ -116,9 +116,9 @@ static u64 pmc_amd_event_map(int event) static u64 pmc_amd_raw_event(u64 event) { -#define K7_EVNTSEL_EVENT_MASK 0x7000000FF -#define K7_EVNTSEL_UNIT_MASK 0x00000FF00 -#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000 +#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL #define K7_EVNTSEL_MASK \ (K7_EVNTSEL_EVENT_MASK | \ -- cgit v1.2.3 From 7dd1fcc258b65da718f01e4684a7b9244501a9fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:33 +0100 Subject: perf_counter: provide pagefault software events We use the generic software counter infrastructure to provide page fault events. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a03b7279efa..c8725752b6c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -1044,6 +1045,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); + /* * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: -- cgit v1.2.3 From ac17dc8e58f3069ea895cfff963adf98ff3cf6b2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:34 +0100 Subject: perf_counter: provide major/minor page fault software events Provide separate sw counters for major and minor page faults. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c8725752b6c..f2d3324d921 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1140,10 +1140,13 @@ good_area: return; } - if (fault & VM_FAULT_MAJOR) + if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - else + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); + } else { tsk->min_flt++; + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + } check_v8086_mode(regs, address, tsk); -- cgit v1.2.3 From 7bb497bd885eedd0f56dfe3cc1b5ff20710d33b9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Mar 2009 08:59:21 +0100 Subject: perf_counter: fix crash on perfmon v1 systems Impact: fix boot crash on Intel Perfmon Version 1 systems Intel Perfmon v1 does not support the global MSRs, nor does it offer the generalized MSR ranges. So support v2 and later CPUs only. 
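The version cut-off comes straight from CPUID leaf 0xA, which the code below reads through union cpuid10_eax. The same fields can be inspected from user space; a quick sketch (GCC or Clang on an x86 machine):

  #include <cpuid.h>
  #include <stdio.h>

  int main(void)
  {
          unsigned int eax, ebx, ecx, edx;

          if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx))
                  return 1;       /* CPUID leaf 0xA not available */

          printf("version id:    %u\n", eax & 0xff);        /* < 2: no global MSRs */
          printf("counters/core: %u\n", (eax >> 8) & 0xff);
          printf("counter width: %u\n", (eax >> 16) & 0xff);
          printf("mask length:   %u\n", (eax >> 24) & 0xff);
          return 0;
  }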
Also mark pmc_ops as read-mostly - to avoid false cacheline sharing. Cc: Paul Mackerras Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 22dab06c08a..6cba9d47b71 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -57,12 +57,14 @@ struct pmc_x86_ops { int max_events; }; -static struct pmc_x86_ops *pmc_ops; +static struct pmc_x86_ops *pmc_ops __read_mostly; static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, }; +static __read_mostly int intel_perfmon_version; + /* * Intel PerfMon v3. Used on Core2 and later. */ @@ -613,7 +615,7 @@ void perf_counter_print_debug(void) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + if (intel_perfmon_version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); @@ -930,10 +932,10 @@ static struct pmc_x86_ops pmc_amd_ops = { static struct pmc_x86_ops *pmc_intel_init(void) { + union cpuid10_edx edx; union cpuid10_eax eax; - unsigned int ebx; unsigned int unused; - union cpuid10_edx edx; + unsigned int ebx; /* * Check whether the Architectural PerfMon supports @@ -943,8 +945,12 @@ static struct pmc_x86_ops *pmc_intel_init(void) if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return NULL; + intel_perfmon_version = eax.split.version_id; + if (intel_perfmon_version < 2) + return NULL; + pr_info("Intel Performance Monitoring support detected.\n"); - pr_info("... version: %d\n", eax.split.version_id); + pr_info("... version: %d\n", intel_perfmon_version); pr_info("... bit width: %d\n", eax.split.bit_width); pr_info("... mask length: %d\n", eax.split.mask_length); -- cgit v1.2.3 From b6c5a71da1477d261bc36254fe1f20d32b57598d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 16 Mar 2009 21:00:00 +1100 Subject: perf_counter: abstract wakeup flag setting in core to fix powerpc build Impact: build fix for powerpc Commit bd753921015e7905 ("perf_counter: software counter event infrastructure") introduced a use of TIF_PERF_COUNTERS into the core perfcounter code. This breaks the build on powerpc because we use a flag in a per-cpu area to signal wakeups on powerpc rather than a thread_info flag, because the thread_info flags have to be manipulated with atomic operations and are thus slower than per-cpu flags. This fixes the by changing the core to use an abstracted set_perf_counter_pending() function, which is defined on x86 to set the TIF_PERF_COUNTERS flag and on powerpc to set the per-cpu flag (paca->perf_counter_pending). It changes the previous powerpc definition of set_perf_counter_pending to not take an argument and adds a clear_perf_counter_pending, so as to simplify the definition on x86. On x86, set_perf_counter_pending() is defined as a macro. Defining it as a static inline in arch/x86/include/asm/perf_counters.h causes compile failures because gets included early in , and the definitions of set_tsk_thread_flag etc. are therefore not available in . (On powerpc this problem is avoided by defining set_perf_counter_pending etc. in .) 
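A condensed illustration of the include-ordering point made above (the header names here are hypothetical; the point is only that a macro defers name lookup to the expansion site, while a static inline needs its callees declared where it is defined):

  /* low.h -- pulled in very early by core headers */
  static inline void kick(void)
  {
          helper();       /* error: helper() not yet declared here */
  }

  /* the macro form compiles: it expands at its call sites,
     where helper() is already in scope */
  #define kick()  helper()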
Signed-off-by: Paul Mackerras --- arch/x86/include/asm/perf_counter.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 2e08ed73664..1662043b340 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -84,6 +84,9 @@ union cpuid10_edx { #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) +#define set_perf_counter_pending() \ + set_tsk_thread_flag(current, TIF_PERF_COUNTERS); + #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); extern void perf_counters_lapic_init(int nmi); -- cgit v1.2.3 From b8e83514b64577b48bfb794fe85fcde40a9343ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:18 +0100 Subject: perf_counter: revamp syscall input ABI Impact: modify ABI The hardware/software classification in hw_event->type became a little strained due to the addition of tracepoint tracing. Instead split up the field and provide a type field to explicitly specify the counter type, while using the event_id field to specify which event to use. Raw counters still work as before, only the raw config now goes into raw_event. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.836807573@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6cba9d47b71..d844ae41d5a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (hw_event->raw) { - hwc->config |= pmc_ops->raw_event(hw_event->type); + if (hw_event->raw_type) { + hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id); } else { - if (hw_event->type >= pmc_ops->max_events) + if (hw_event->event_id >= pmc_ops->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= pmc_ops->event_map(hw_event->type); + hwc->config |= pmc_ops->event_map(hw_event->event_id); } counter->wakeup_pending = 0; @@ -715,7 +715,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); - perf_store_irq_data(sibling, counter->hw_event.type); + perf_store_irq_data(sibling, counter->hw_event.event_config); perf_store_irq_data(sibling, atomic64_read(&counter->count)); } } -- cgit v1.2.3 From 0322cd6ec504b0bf08ca7b2c3d7f43bda37d79c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:19 +0100 Subject: perf_counter: unify irq output code Impact: cleanup Having 3 slightly different copies of the same code around does nobody any good. First step in revamping the output format. 
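For context, the three near-copies being unified all implemented the same bounded-append-with-overrun-count pattern, visible in the perf_store_irq_data() hunk removed below. Schematically, as a user-space model (names and sizes hypothetical):

  #include <stdint.h>
  #include <string.h>

  #define BUFLEN 1024

  struct sample_buf {
          unsigned int len;
          unsigned int overrun;
          uint8_t      data[BUFLEN];
  };

  /* append one u64; count -- rather than silently drop -- on overflow */
  static void buf_store(struct sample_buf *b, uint64_t v)
  {
          if (b->len > BUFLEN - sizeof(uint64_t)) {
                  b->overrun++;
                  return;
          }
          memcpy(&b->data[b->len], &v, sizeof(v));
          b->len += sizeof(v);
  }

After this patch only perf_counter_output() carries that logic.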
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.929962222@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 53 +------------------------------------- 1 file changed, 1 insertion(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d844ae41d5a..902282d68b0 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -674,20 +674,6 @@ static void pmc_generic_disable(struct perf_counter *counter) x86_perf_counter_update(counter, hwc, idx); } -static void perf_store_irq_data(struct perf_counter *counter, u64 data) -{ - struct perf_data *irqdata = counter->irqdata; - - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; - - *p = data; - irqdata->len += sizeof(u64); - } -} - /* * Save and restart an expired counter. Called by NMI contexts, * so it has to be careful about preempting normal counter ops: @@ -704,22 +690,6 @@ static void perf_save_and_restart(struct perf_counter *counter) __pmc_generic_enable(counter, hwc, idx); } -static void -perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) -{ - struct perf_counter *counter, *group_leader = sibling->group_leader; - - /* - * Store sibling timestamps (if any): - */ - list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - - x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); - perf_store_irq_data(sibling, counter->hw_event.event_config); - perf_store_irq_data(sibling, atomic64_read(&counter->count)); - } -} - /* * Maximum interrupt frequency of 100KHz per CPU */ @@ -754,28 +724,7 @@ again: continue; perf_save_and_restart(counter); - - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - continue; - case PERF_RECORD_IRQ: - perf_store_irq_data(counter, instruction_pointer(regs)); - break; - case PERF_RECORD_GROUP: - perf_handle_group(counter, &status, &ack); - break; - } - /* - * From NMI context we cannot call into the scheduler to - * do a task wakeup - but we mark these generic as - * wakeup_pending and initate a wakeup callback: - */ - if (nmi) { - counter->wakeup_pending = 1; - set_tsk_thread_flag(current, TIF_PERF_COUNTERS); - } else { - wake_up(&counter->waitq); - } + perf_counter_output(counter, nmi, regs); } hw_perf_ack_status(ack); -- cgit v1.2.3 From f4a2deb4860497f4332cf6a1acddab3dd628ddf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:06 +0100 Subject: perf_counter: remove the event config bitfields Since the bitfields turned into a bit of a mess, remove them and rely on good old masks. 
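Schematically, the perf_event_raw()/perf_event_id()/perf_event_config() accessors used in the hunk below treat the event configuration as one packed u64 examined with shifts and masks. The layout sketched here is hypothetical -- the real definitions live in the generic perf_counter headers:

  #include <stdint.h>

  #define EVENT_RAW_FLAG     (1ULL << 63)
  #define EVENT_TYPE_SHIFT   56
  #define EVENT_TYPE_MASK    (0x7fULL << EVENT_TYPE_SHIFT)
  #define EVENT_CONFIG_MASK  ((1ULL << EVENT_TYPE_SHIFT) - 1)

  static inline int event_is_raw(uint64_t ev)
  {
          return !!(ev & EVENT_RAW_FLAG);
  }

  static inline unsigned int event_type(uint64_t ev)
  {
          return (ev & EVENT_TYPE_MASK) >> EVENT_TYPE_SHIFT;
  }

  static inline uint64_t event_config(uint64_t ev)
  {
          return ev & EVENT_CONFIG_MASK;
  }

Unlike C bitfields, the masks have a defined bit-level layout and can be applied verbatim to values coming in from user space.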
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.059499915@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 902282d68b0..3f95b0cdc55 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (hw_event->raw_type) { - hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id); + if (perf_event_raw(hw_event)) { + hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event)); } else { - if (hw_event->event_id >= pmc_ops->max_events) + if (perf_event_id(hw_event) >= pmc_ops->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= pmc_ops->event_map(hw_event->event_id); + hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); } counter->wakeup_pending = 0; -- cgit v1.2.3 From 925d519ab82b6dd7aca9420d809ee83819c08db2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:02 +0200 Subject: perf_counter: unify and fix delayed counter wakeup While going over the wakeup code I noticed delayed wakeups only work for hardware counters but basically all software counters rely on them. This patch unifies and generalizes the delayed wakeup to fix this issue. Since we're dealing with NMI context bits here, use a cmpxchg() based single link list implementation to track counters that have pending wakeups. [ This should really be generic code for delayed wakeups, but since we cannot use cmpxchg()/xchg() in generic code, I've let it live in the perf_counter code. -- Eric Dumazet could use it to aggregate the network wakeups. ] Furthermore, the x86 method of using TIF flags was flawed in that its quite possible to end up setting the bit on the idle task, loosing the wakeup. The powerpc method uses per-cpu storage and does appear to be sufficient. 
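The cmpxchg()-based single-linked list mentioned above is the standard NMI-safe push. A self-contained C11 model, with the kernel primitives replaced by <stdatomic.h> (a sketch, not the kernel code):

  #include <stdatomic.h>
  #include <stddef.h>

  struct pending {
          struct pending *next;
  };

  static _Atomic(struct pending *) pending_head;

  /* NMI context: a lock-free push is the only operation performed */
  static void pending_push(struct pending *p)
  {
          struct pending *old = atomic_load(&pending_head);

          do {
                  p->next = old;
          } while (!atomic_compare_exchange_weak(&pending_head, &old, p));
  }

  /* IRQ/process context: detach the whole list in one exchange */
  static struct pending *pending_take_all(void)
  {
          return atomic_exchange(&pending_head, NULL);
  }

Nothing in the push path sleeps or takes a lock, which is what makes it usable from NMIs; the consumer later walks the detached list and performs the actual wakeups.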
Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.153932974@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 5 +++-- arch/x86/include/asm/thread_info.h | 4 +--- arch/x86/kernel/cpu/perf_counter.c | 29 ----------------------------- arch/x86/kernel/signal.c | 6 ------ 4 files changed, 4 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 1662043b340..e2b0e66b235 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -84,8 +84,9 @@ union cpuid10_edx { #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) -#define set_perf_counter_pending() \ - set_tsk_thread_flag(current, TIF_PERF_COUNTERS); +#define set_perf_counter_pending() do { } while (0) +#define clear_perf_counter_pending() do { } while (0) +#define test_perf_counter_pending() (0) #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 3ffd5d2a367..8820a73ae09 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -83,7 +83,6 @@ struct thread_info { #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ -#define TIF_PERF_COUNTERS 11 /* notify perf counter work */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ @@ -107,7 +106,6 @@ struct thread_info { #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) -#define _TIF_PERF_COUNTERS (1 << TIF_PERF_COUNTERS) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) @@ -141,7 +139,7 @@ struct thread_info { /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME) + (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3f95b0cdc55..7aab177fb56 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -227,7 +227,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) */ hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); } - counter->wakeup_pending = 0; return 0; } @@ -773,34 +772,6 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_exit(); } -/* - * This handler is triggered by NMI contexts: - */ -void perf_counter_notify(struct pt_regs *regs) -{ - struct cpu_hw_counters *cpuc; - unsigned long flags; - int bit, cpu; - - local_irq_save(flags); - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); - - for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) { - struct perf_counter *counter = cpuc->counters[bit]; - - if (!counter) - continue; - - if (counter->wakeup_pending) { - counter->wakeup_pending = 0; - wake_up(&counter->waitq); - } - } - - local_irq_restore(flags); -} - void perf_counters_lapic_init(int nmi) { u32 apic_val; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 611615a92c9..0a813b17b17 100644 --- a/arch/x86/kernel/signal.c +++ 
b/arch/x86/kernel/signal.c @@ -6,7 +6,6 @@ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ -#include #include #include #include @@ -872,11 +871,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) tracehook_notify_resume(regs); } - if (thread_info_flags & _TIF_PERF_COUNTERS) { - clear_thread_flag(TIF_PERF_COUNTERS); - perf_counter_notify(regs); - } - #ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); #endif /* CONFIG_X86_32 */ -- cgit v1.2.3 From 9ea98e191255ee642e64a5745014424fc63f83b0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:09 +0200 Subject: perf_counter: x86: proper error propagation for the x86 hw_perf_counter_init() Now that Paul cleaned up the error propagation paths, pass down the x86 error as well. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.792822360@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7aab177fb56..b8885ccd804 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -954,7 +954,7 @@ hw_perf_counter_init(struct perf_counter *counter) err = __hw_perf_counter_init(counter); if (err) - return NULL; + return ERR_PTR(err); return &x86_perf_counter_ops; } -- cgit v1.2.3 From d7d59fb323833682b117b528d77eeb8ef587036a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:15 +0200 Subject: perf_counter: x86: callchain support Provide the x86 perf_callchain() implementation. Code based on the ftrace/sysprof code from Soeren Sandmann Pedersen. 
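The user-side walk added below is the classic frame-pointer chase. The same idea in a few lines of user space (x86-64, built with -fno-omit-frame-pointer) -- a sketch that trusts every saved frame pointer, which is exactly what the kernel version must not do, hence access_ok() plus __copy_from_user_inatomic() in copy_stack_frame():

  #include <stdio.h>

  struct frame {
          struct frame *next_fp;          /* saved caller %rbp */
          void         *return_address;
  };

  static void show_backtrace(void)
  {
          struct frame *fp = __builtin_frame_address(0);
          int depth = 0;

          /* depth cap: frames from code built without frame pointers
             (e.g. libc) may contain garbage */
          while (fp && depth++ < 8) {
                  printf("%p\n", fp->return_address);
                  fp = fp->next_fp;
          }
  }

  int main(void)
  {
          show_backtrace();
          return 0;
  }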
Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Cc: Soeren Sandmann Pedersen Cc: Frederic Weisbecker Cc: Steven Rostedt Orig-LKML-Reference: <20090330171024.341993293@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 154 +++++++++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b8885ccd804..e16dfafc6d7 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -16,8 +16,10 @@ #include #include #include +#include #include +#include static bool perf_counters_initialized __read_mostly; @@ -958,3 +960,155 @@ hw_perf_counter_init(struct perf_counter *counter) return &x86_perf_counter_ops; } + +/* + * callchain support + */ + +static inline +void callchain_store(struct perf_callchain_entry *entry, unsigned long ip) +{ + if (entry->nr < MAX_STACK_DEPTH) + entry->ip[entry->nr++] = ip; +} + +static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); + + +static void +backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + /* Ignore warnings */ +} + +static void backtrace_warning(void *data, char *msg) +{ + /* Ignore warnings */ +} + +static int backtrace_stack(void *data, char *name) +{ + /* Don't bother with IRQ stacks for now */ + return -1; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ + struct perf_callchain_entry *entry = data; + + if (reliable) + callchain_store(entry, addr); +} + +static const struct stacktrace_ops backtrace_ops = { + .warning = backtrace_warning, + .warning_symbol = backtrace_warning_symbol, + .stack = backtrace_stack, + .address = backtrace_address, +}; + +static void +perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + unsigned long bp; + char *stack; + + callchain_store(entry, instruction_pointer(regs)); + + stack = ((char *)regs + sizeof(struct pt_regs)); +#ifdef CONFIG_FRAME_POINTER + bp = frame_pointer(regs); +#else + bp = 0; +#endif + + dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); +} + + +struct stack_frame { + const void __user *next_fp; + unsigned long return_address; +}; + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + int ret; + + if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) + return 0; + + ret = 1; + pagefault_disable(); + if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + ret = 0; + pagefault_enable(); + + return ret; +} + +static void +perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + struct stack_frame frame; + const void __user *fp; + + regs = (struct pt_regs *)current->thread.sp0 - 1; + fp = (void __user *)regs->bp; + + callchain_store(entry, regs->ip); + + while (entry->nr < MAX_STACK_DEPTH) { + frame.next_fp = NULL; + frame.return_address = 0; + + if (!copy_stack_frame(fp, &frame)) + break; + + if ((unsigned long)fp < user_stack_pointer(regs)) + break; + + callchain_store(entry, frame.return_address); + fp = frame.next_fp; + } +} + +static void +perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + int is_user; + + if (!regs) + return; + + is_user = user_mode(regs); + + if (!current || current->pid == 0) + return; + + if (is_user && current->state != TASK_RUNNING) + return; + + if (!is_user) + perf_callchain_kernel(regs, entry); + + if (current->mm) + perf_callchain_user(regs, 
entry); +} + +struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + struct perf_callchain_entry *entry; + + if (in_nmi()) + entry = &__get_cpu_var(nmi_entry); + else + entry = &__get_cpu_var(irq_entry); + + entry->nr = 0; + + perf_do_callchain(regs, entry); + + return entry; +} -- cgit v1.2.3 From 4e935e47177c3b26cf383e79849bae2a464d0160 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:16 +0200 Subject: perf_counter: pmc arbitration Follow the example set by powerpc and try to play nice with oprofile and the nmi watchdog. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171024.459968444@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 75 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index e16dfafc6d7..2a946a160ca 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -20,6 +20,7 @@ #include #include +#include static bool perf_counters_initialized __read_mostly; @@ -172,6 +173,65 @@ again: atomic64_sub(delta, &hwc->period_left); } +static atomic_t num_counters; +static DEFINE_MUTEX(pmc_reserve_mutex); + +static bool reserve_pmc_hardware(void) +{ + int i; + + if (nmi_watchdog == NMI_LOCAL_APIC) + disable_lapic_nmi_watchdog(); + + for (i = 0; i < nr_counters_generic; i++) { + if (!reserve_perfctr_nmi(pmc_ops->perfctr + i)) + goto perfctr_fail; + } + + for (i = 0; i < nr_counters_generic; i++) { + if (!reserve_evntsel_nmi(pmc_ops->eventsel + i)) + goto eventsel_fail; + } + + return true; + +eventsel_fail: + for (i--; i >= 0; i--) + release_evntsel_nmi(pmc_ops->eventsel + i); + + i = nr_counters_generic; + +perfctr_fail: + for (i--; i >= 0; i--) + release_perfctr_nmi(pmc_ops->perfctr + i); + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); + + return false; +} + +static void release_pmc_hardware(void) +{ + int i; + + for (i = 0; i < nr_counters_generic; i++) { + release_perfctr_nmi(pmc_ops->perfctr + i); + release_evntsel_nmi(pmc_ops->eventsel + i); + } + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); +} + +static void hw_perf_counter_destroy(struct perf_counter *counter) +{ + if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) { + release_pmc_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + /* * Setup the hardware configuration for a given hw_event_type */ @@ -179,10 +239,23 @@ static int __hw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; + int err; if (unlikely(!perf_counters_initialized)) return -EINVAL; + err = 0; + if (atomic_inc_not_zero(&num_counters)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware()) + err = -EBUSY; + else + atomic_inc(&num_counters); + mutex_unlock(&pmc_reserve_mutex); + } + if (err) + return err; + /* * Generate PMC IRQs: * (keep 'enabled' bit clear for now) @@ -230,6 +303,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); } + counter->destroy = hw_perf_counter_destroy; + return 0; } -- cgit v1.2.3 From 5872bdb88a35fae7d224bd6b21e5f377e854ccfc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:03 +0200 Subject: perf_counter: add more context information Put in counts to 
tell which ips belong to what context. ----- | | hv | -- nr | | kernel | -- | | user ----- Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.493101305@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2a946a160ca..c74e20d593a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1088,6 +1088,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { unsigned long bp; char *stack; + int nr = entry->nr; callchain_store(entry, instruction_pointer(regs)); @@ -1099,6 +1100,8 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) #endif dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); + + entry->kernel = entry->nr - nr; } @@ -1128,6 +1131,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) { struct stack_frame frame; const void __user *fp; + int nr = entry->nr; regs = (struct pt_regs *)current->thread.sp0 - 1; fp = (void __user *)regs->bp; @@ -1147,6 +1151,8 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, frame.return_address); fp = frame.next_fp; } + + entry->user = entry->nr - nr; } static void @@ -1182,6 +1188,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) entry = &__get_cpu_var(irq_entry); entry->nr = 0; + entry->hv = 0; + entry->kernel = 0; + entry->user = 0; perf_do_callchain(regs, entry); -- cgit v1.2.3 From b6276f353bf490add62dcf7db0ebd75baa3e1a37 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:03 +0200 Subject: perf_counter: x86: self-IPI for pending work Implement set_perf_counter_pending() with a self-IPI so that it will run ASAP in a usable context. For now use a second IRQ vector, because the primary vector pokes the apic in funny ways that seem to confuse things. 
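In outline, the mechanism this patch builds (see the hunks below for the real code):

  /*
   *  NMI / counter overflow path      -- cannot wake tasks or take locks
   *      set_perf_counter_pending()
   *          apic->send_IPI_self(LOCAL_PENDING_VECTOR)
   *
   *  ...the NMI returns, and the self-IPI is then delivered
   *     as an ordinary maskable interrupt...
   *
   *  smp_perf_pending_interrupt()     -- normal IRQ context
   *      ack_APIC_irq()
   *      inc_irq_stat(apic_pending_irqs)
   *      perf_counter_do_pending()    -- safe to run the deferred wakeups
   */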
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.724626696@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/entry_arch.h | 1 + arch/x86/include/asm/hardirq.h | 1 + arch/x86/include/asm/hw_irq.h | 1 + arch/x86/include/asm/irq_vectors.h | 5 +++++ arch/x86/include/asm/perf_counter.h | 3 ++- arch/x86/kernel/cpu/perf_counter.c | 14 ++++++++++++++ arch/x86/kernel/entry_64.S | 2 ++ arch/x86/kernel/irq.c | 5 +++++ arch/x86/kernel/irqinit_32.c | 1 + arch/x86/kernel/irqinit_64.c | 1 + 10 files changed, 33 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index c2e6bedaf25..fe24d280249 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -50,6 +50,7 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) #ifdef CONFIG_PERF_COUNTERS BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) +BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) #endif #ifdef CONFIG_X86_MCE_P4THERMAL diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 25454427cee..f5ebe2aaca4 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -14,6 +14,7 @@ typedef struct { #endif unsigned int generic_irqs; /* arch dependent */ unsigned int apic_perf_irqs; + unsigned int apic_pending_irqs; #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index ae80f64973e..7309c0ad690 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -30,6 +30,7 @@ extern void apic_timer_interrupt(void); extern void generic_interrupt(void); extern void error_interrupt(void); extern void perf_counter_interrupt(void); +extern void perf_pending_interrupt(void); extern void spurious_interrupt(void); extern void thermal_interrupt(void); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 3cbd79bbb47..545bb811ccb 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -116,6 +116,11 @@ */ #define GENERIC_INTERRUPT_VECTOR 0xed +/* + * Performance monitoring pending work vector: + */ +#define LOCAL_PENDING_VECTOR 0xec + /* * First APIC vector available to drivers: (vectors 0x30-0xee) we * start at 0x31(0x41) to spread out vectors evenly between priority diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index e2b0e66b235..d08dd52cb8f 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -84,7 +84,8 @@ union cpuid10_edx { #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) -#define set_perf_counter_pending() do { } while (0) +extern void set_perf_counter_pending(void); + #define clear_perf_counter_pending() do { } while (0) #define test_perf_counter_pending() (0) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c74e20d593a..438415866fe 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -849,6 +849,20 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_exit(); } +void smp_perf_pending_interrupt(struct pt_regs *regs) +{ + irq_enter(); + ack_APIC_irq(); + inc_irq_stat(apic_pending_irqs); + perf_counter_do_pending(); + irq_exit(); +} + +void set_perf_counter_pending(void) +{ + 
apic->send_IPI_self(LOCAL_PENDING_VECTOR); +} + void perf_counters_lapic_init(int nmi) { u32 apic_val; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3f129d963a0..1d46cba56fd 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1028,6 +1028,8 @@ apicinterrupt SPURIOUS_APIC_VECTOR \ #ifdef CONFIG_PERF_COUNTERS apicinterrupt LOCAL_PERF_VECTOR \ perf_counter_interrupt smp_perf_counter_interrupt +apicinterrupt LOCAL_PENDING_VECTOR \ + perf_pending_interrupt smp_perf_pending_interrupt #endif /* diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 9c2754302ec..d465487da58 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -67,6 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance counter interrupts\n"); + seq_printf(p, "PND: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); + seq_printf(p, " Performance pending work\n"); #endif if (generic_interrupt_extension) { seq_printf(p, "PLT: "); @@ -171,6 +175,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; sum += irq_stats(cpu)->apic_perf_irqs; + sum += irq_stats(cpu)->apic_pending_irqs; #endif if (generic_interrupt_extension) sum += irq_stats(cpu)->generic_irqs; diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 925d87cfc55..3190a6b961e 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -166,6 +166,7 @@ static void __init apic_intr_init(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); # ifdef CONFIG_PERF_COUNTERS alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); + alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); # endif # ifdef CONFIG_X86_MCE_P4THERMAL diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 665e2ab48ab..53ceb26f80f 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -156,6 +156,7 @@ static void __init apic_intr_init(void) /* Performance monitoring interrupt: */ #ifdef CONFIG_PERF_COUNTERS alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); + alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); #endif } -- cgit v1.2.3 From f6c7d5fe58b4846ee0cb4b98b6042489705eced4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:04 +0200 Subject: perf_counter: theres more to overflow than writing events Prepare for more generic overflow handling. The new perf_counter_overflow() method will handle the generic bits of the counter overflow, and can return a !0 return value, in which case the counter should be (soft) disabled, so that it won't count until it's properly disabled. 
XXX: do powerpc and swcounter Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.812109629@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 438415866fe..1116a41bc7b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -800,7 +800,8 @@ again: continue; perf_save_and_restart(counter); - perf_counter_output(counter, nmi, regs); + if (perf_counter_overflow(counter, nmi, regs)) + __pmc_generic_disable(counter, &counter->hw, bit); } hw_perf_ack_status(ack); -- cgit v1.2.3 From 98c2aaf8be5baf7193be37fb28bce8e7327158bc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 7 Apr 2009 11:30:17 +0200 Subject: x86, perfcounters: add atomic64_xchg() Complete atomic64_t support on the 32-bit side by adding atomic64_xch(). Cc: Peter Zijlstra LKML-Reference: <20090406094518.445450972@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic_32.h | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index 977250ed8b8..aff9f1fcdcd 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -291,19 +291,37 @@ atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val, } /** - * atomic64_set - set atomic64 variable + * atomic64_xchg - xchg atomic64 variable * @ptr: pointer to type atomic64_t * @new_val: value to assign + * @old_val: old value that was there * - * Atomically sets the value of @ptr to @new_val. + * Atomically xchgs the value of @ptr to @new_val and returns + * the old value. */ -static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) + +static inline unsigned long long +atomic64_xchg(atomic64_t *ptr, unsigned long long new_val) { unsigned long long old_val; do { old_val = atomic_read(ptr); } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); + + return old_val; +} + +/** + * atomic64_set - set atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * + * Atomically sets the value of @ptr to @new_val. + */ +static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) +{ + atomic64_xchg(ptr, new_val); } /** -- cgit v1.2.3 From 7333a8003cdc0470e8c0ae8b949cbc44f3165ff3 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 25 Mar 2009 10:50:34 +0900 Subject: x86: smarten /proc/interrupts output for new counters Now /proc/interrupts of tip tree has new counters: CNT: Performance counter interrupts Format change of output, as like that by commit: commit 7a81d9a7da03d2f27840d659f97ef140d032f609 x86: smarten /proc/interrupts output should be applied to these new counters too. 
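The change is purely about printf's '*' width specifier: it takes the field width from an argument instead of hard-coding it, so the label column scales with however wide the IRQ-number column turned out to be. A standalone demo (prec is computed from the IRQ count in the real code):

  #include <stdio.h>

  int main(void)
  {
          int prec = 4;

          printf("%*s: %10u\n", prec, "CNT", 12345u);
          printf("%*s: %10u\n", prec, "PND", 678u);
          return 0;
  }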
Signed-off-by: Hidetoshi Seto Cc: Jan Beulich LKML-Reference: <49C98DEA.8060208@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d465487da58..dccaaa85578 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -63,7 +63,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "CNT: "); + seq_printf(p, "%*s: ", prec, "CNT"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance counter interrupts\n"); -- cgit v1.2.3 From 78f13e9525ba777da25c4ddab89f28e9366a8b7c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:33 +0200 Subject: perf_counter: allow for data addresses to be recorded Paul suggested we allow for data addresses to be recorded along with the traditional IPs as power can provide these. For now, only the software pagefault events provide data addresses, but in the future power might as well for some events. x86 doesn't seem capable of providing this atm. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.394816925@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- arch/x86/mm/fault.c | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1116a41bc7b..0fcbaab83f9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -800,7 +800,7 @@ again: continue; perf_save_and_restart(counter); - if (perf_counter_overflow(counter, nmi, regs)) + if (perf_counter_overflow(counter, nmi, regs, 0)) __pmc_generic_disable(counter, &counter->hw, bit); } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f2d3324d921..6f9df2babe4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1045,7 +1045,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -1142,10 +1142,12 @@ good_area: if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, + regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, + regs, address); } check_v8086_mode(regs, address, tsk); -- cgit v1.2.3 From 0f3fd87ce43727d6b8573191ce89e874533b1429 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Mon, 13 Apr 2009 20:24:50 +0100 Subject: perf_counter: fix alignment in /proc/interrupts Trivial fix on columns alignment in /proc/interrupts file. 
Signed-off-by: Luis Henriques Cc: Peter Zijlstra LKML-Reference: <20090413192449.GA3920@hades.domain.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index dccaaa85578..849cfabb1fd 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -67,7 +67,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance counter interrupts\n"); - seq_printf(p, "PND: "); + seq_printf(p, "%*s: ", prec, "PND"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); seq_printf(p, " Performance pending work\n"); -- cgit v1.2.3 From da1a776be1ac7f78bb30ececbec4c1383163b079 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:46:58 +0200 Subject: perf_counter, x86: remove X86_FEATURE_ARCH_PERFMON flag for AMD cpus X86_FEATURE_ARCH_PERFMON is an Intel hardware feature that does not work on AMD CPUs. The flag is now only used in Intel specific code (especially initialization). [ Impact: refactor code ] Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1241002046-8832-2-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 4 ---- arch/x86/kernel/cpu/perf_counter.c | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index fd69c514ca2..7e4a459daa6 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -420,10 +420,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 6) set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); - /* Enable Performance counter for K7 and later */ - if (c->x86 > 6 && c->x86 <= 0x11) - set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); - if (!c->x86_model_id[0]) { switch (c->x86) { case 0xf: diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 0fcbaab83f9..7d0f81dcb52 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -949,6 +949,9 @@ static struct pmc_x86_ops *pmc_intel_init(void) unsigned int unused; unsigned int ebx; + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return NULL; + /* * Check whether the Architectural PerfMon supports * Branch Misses Retired Event or not. @@ -987,9 +990,6 @@ static struct pmc_x86_ops *pmc_amd_init(void) void __init init_hw_perf_counters(void) { - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return; - switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: pmc_ops = pmc_intel_init(); -- cgit v1.2.3 From 4138960a9251a265002b5cf07e671a49f8495381 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:00 +0200 Subject: perf_counter, x86: add default path to cpu detection This quits hw counter initialization immediately if no cpu is detected. 
[ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-4-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7d0f81dcb52..d6d6529349d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -997,6 +997,8 @@ void __init init_hw_perf_counters(void) case X86_VENDOR_AMD: pmc_ops = pmc_amd_init(); break; + default: + return; } if (!pmc_ops) return; -- cgit v1.2.3 From 4295ee62660b13ddb87d41539f49b239e6e7d56f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:01 +0200 Subject: perf_counter, x86: rework pmc_amd_save_disable_all() and pmc_amd_restore_all() MSR reads and writes are expensive. This patch adds checks to avoid them where possible. [ Impact: micro-optimization on AMD CPUs ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-5-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d6d6529349d..75a090394b6 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -334,11 +334,13 @@ static u64 pmc_amd_save_disable_all(void) for (idx = 0; idx < nr_counters_generic; idx++) { u64 val; + if (!test_bit(idx, cpuc->active_mask)) + continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) { - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); - } + if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) + continue; + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } return enabled; @@ -372,13 +374,15 @@ static void pmc_amd_restore_all(u64 ctrl) return; for (idx = 0; idx < nr_counters_generic; idx++) { - if (test_bit(idx, cpuc->active_mask)) { - u64 val; + u64 val; - rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); - } + if (!test_bit(idx, cpuc->active_mask)) + continue; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + continue; + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } } -- cgit v1.2.3 From 527e26af3741a2168986d8b82653ffe173891324 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:02 +0200 Subject: perf_counter, x86: protect per-cpu variables with compile barriers only Per-cpu variables need not be protected with cpu barriers (smp_wmb()). Protection is only needed against preemption on the same cpu (rescheduling or the nmi handler). This can be done using a compiler barrier only.
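To make the barrier distinction concrete: smp_wmb() orders stores as observed by other CPUs, while barrier() only prevents the compiler from reordering; for state touched exclusively by the local cpu (racing only with its own nmi handler or a reschedule), the compiler barrier is enough. A stripped-down sketch of the pattern, with invented names and a locally defined barrier macro — not code from the patch:

/* compiler-only barrier, same idea as the kernel's barrier() macro */
#define compiler_barrier()	__asm__ __volatile__("" : : : "memory")

struct counter;

struct cpu_slots {
	struct counter *slot[4];	/* per-cpu: only this cpu and its NMI handler touch it */
};

static void remove_counter(struct cpu_slots *cs, int idx)
{
	cs->slot[idx] = 0;	/* the NMI handler on this cpu must see NULL ...      */
	compiler_barrier();	/* ... before the counter is torn down below;          */
				/* no other cpu reads this, so smp_wmb() is not needed */
	/* ... free or reuse the counter here ... */
}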
[ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-6-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 75a090394b6..ad663d5ad2d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -673,7 +673,7 @@ try_generic: /* * Make it visible before enabling the hw: */ - smp_wmb(); + barrier(); __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); @@ -745,7 +745,7 @@ static void pmc_generic_disable(struct perf_counter *counter) * Make sure the cleared pointer becomes visible before we * (potentially) free the counter: */ - smp_wmb(); + barrier(); /* * Drain the remaining delta count out of a counter -- cgit v1.2.3 From 4aeb0b4239bb3b67ed402cb9cef3e000c892cadf Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:03 +0200 Subject: perfcounters: rename struct hw_perf_counter_ops into struct pmu This patch renames struct hw_perf_counter_ops into struct pmu. It introduces a structure to describe a cpu-specific pmu (performance monitoring unit). It may contain ops and data. The new name of the structure fits better, is shorter, and thus easier to handle. Where appropriate, function and variable names have been changed too.
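The renamed structure is an instance of the usual kernel "ops table" pattern: generic code calls through a struct of function pointers, and each backend only fills in the table. The following self-contained illustration of that pattern uses invented names and is not the actual struct pmu layout:

#include <stdio.h>

struct counter;				/* opaque to the generic code */

struct pmu_ops {			/* illustrative stand-in for struct pmu */
	int  (*enable)(struct counter *);
	void (*disable)(struct counter *);
	void (*read)(struct counter *);
};

static int  demo_enable(struct counter *c)  { (void)c; puts("enable");  return 0; }
static void demo_disable(struct counter *c) { (void)c; puts("disable"); }
static void demo_read(struct counter *c)    { (void)c; puts("read"); }

static const struct pmu_ops demo_pmu = {
	.enable  = demo_enable,
	.disable = demo_disable,
	.read    = demo_read,
};

int main(void)
{
	demo_pmu.enable(0);	/* generic code only ever sees the ops table */
	demo_pmu.read(0);
	demo_pmu.disable(0);
	return 0;
}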
__pmc_generic_enable(counter, hwc, idx); + __x86_pmu_enable(counter, hwc, idx); return 0; } @@ -731,13 +731,13 @@ void perf_counter_print_debug(void) local_irq_enable(); } -static void pmc_generic_disable(struct perf_counter *counter) +static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __pmc_generic_disable(counter, hwc, idx); + __x86_pmu_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; @@ -767,7 +767,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) - __pmc_generic_enable(counter, hwc, idx); + __x86_pmu_enable(counter, hwc, idx); } /* @@ -805,7 +805,7 @@ again: perf_save_and_restart(counter); if (perf_counter_overflow(counter, nmi, regs, 0)) - __pmc_generic_disable(counter, &counter->hw, bit); + __x86_pmu_disable(counter, &counter->hw, bit); } hw_perf_ack_status(ack); @@ -1034,19 +1034,18 @@ void __init init_hw_perf_counters(void) register_die_notifier(&perf_counter_nmi_notifier); } -static void pmc_generic_read(struct perf_counter *counter) +static void x86_pmu_read(struct perf_counter *counter) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); } -static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .enable = pmc_generic_enable, - .disable = pmc_generic_disable, - .read = pmc_generic_read, +static const struct pmu pmu = { + .enable = x86_pmu_enable, + .disable = x86_pmu_disable, + .read = x86_pmu_read, }; -const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter) +const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { int err; @@ -1054,7 +1053,7 @@ hw_perf_counter_init(struct perf_counter *counter) if (err) return ERR_PTR(err); - return &x86_perf_counter_ops; + return &pmu; } /* -- cgit v1.2.3 From 5f4ec28ffe77c840354cce1820a3436106e9e0f1 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:04 +0200 Subject: perf_counter, x86: rename struct pmc_x86_ops into struct x86_pmu This patch renames struct pmc_x86_ops into struct x86_pmu. It introduces a structure to describe an x86 model specific pmu (performance monitoring unit). It may contain ops and data. The new name of the structure fits better, is shorter, and thus better to handle. Where it was appropriate, names of function and variable have been changed too. 
[ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-8-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 135 +++++++++++++++++++------------------ 1 file changed, 68 insertions(+), 67 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 95de980c74a..808a1a11346 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -44,9 +44,9 @@ struct cpu_hw_counters { }; /* - * struct pmc_x86_ops - performance counter x86 ops + * struct x86_pmu - generic x86 pmu */ -struct pmc_x86_ops { +struct x86_pmu { u64 (*save_disable_all)(void); void (*restore_all)(u64); u64 (*get_status)(u64); @@ -60,7 +60,7 @@ struct pmc_x86_ops { int max_events; }; -static struct pmc_x86_ops *pmc_ops __read_mostly; +static struct x86_pmu *x86_pmu __read_mostly; static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, @@ -82,12 +82,12 @@ static const u64 intel_perfmon_event_map[] = [PERF_COUNT_BUS_CYCLES] = 0x013c, }; -static u64 pmc_intel_event_map(int event) +static u64 intel_pmu_event_map(int event) { return intel_perfmon_event_map[event]; } -static u64 pmc_intel_raw_event(u64 event) +static u64 intel_pmu_raw_event(u64 event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL @@ -114,12 +114,12 @@ static const u64 amd_perfmon_event_map[] = [PERF_COUNT_BRANCH_MISSES] = 0x00c5, }; -static u64 pmc_amd_event_map(int event) +static u64 amd_pmu_event_map(int event) { return amd_perfmon_event_map[event]; } -static u64 pmc_amd_raw_event(u64 event) +static u64 amd_pmu_raw_event(u64 event) { #define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL @@ -184,12 +184,12 @@ static bool reserve_pmc_hardware(void) disable_lapic_nmi_watchdog(); for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_perfctr_nmi(pmc_ops->perfctr + i)) + if (!reserve_perfctr_nmi(x86_pmu->perfctr + i)) goto perfctr_fail; } for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_evntsel_nmi(pmc_ops->eventsel + i)) + if (!reserve_evntsel_nmi(x86_pmu->eventsel + i)) goto eventsel_fail; } @@ -197,13 +197,13 @@ static bool reserve_pmc_hardware(void) eventsel_fail: for (i--; i >= 0; i--) - release_evntsel_nmi(pmc_ops->eventsel + i); + release_evntsel_nmi(x86_pmu->eventsel + i); i = nr_counters_generic; perfctr_fail: for (i--; i >= 0; i--) - release_perfctr_nmi(pmc_ops->perfctr + i); + release_perfctr_nmi(x86_pmu->perfctr + i); if (nmi_watchdog == NMI_LOCAL_APIC) enable_lapic_nmi_watchdog(); @@ -216,8 +216,8 @@ static void release_pmc_hardware(void) int i; for (i = 0; i < nr_counters_generic; i++) { - release_perfctr_nmi(pmc_ops->perfctr + i); - release_evntsel_nmi(pmc_ops->eventsel + i); + release_perfctr_nmi(x86_pmu->perfctr + i); + release_evntsel_nmi(x86_pmu->eventsel + i); } if (nmi_watchdog == NMI_LOCAL_APIC) @@ -293,14 +293,14 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * Raw event type provide the config in the event structure */ if (perf_event_raw(hw_event)) { - hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event)); + hwc->config |= x86_pmu->raw_event(perf_event_config(hw_event)); } else { - if (perf_event_id(hw_event) >= pmc_ops->max_events) + if (perf_event_id(hw_event) >= x86_pmu->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= 
pmc_ops->event_map(perf_event_id(hw_event)); + hwc->config |= x86_pmu->event_map(perf_event_id(hw_event)); } counter->destroy = hw_perf_counter_destroy; @@ -308,7 +308,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return 0; } -static u64 pmc_intel_save_disable_all(void) +static u64 intel_pmu_save_disable_all(void) { u64 ctrl; @@ -318,7 +318,7 @@ static u64 pmc_intel_save_disable_all(void) return ctrl; } -static u64 pmc_amd_save_disable_all(void) +static u64 amd_pmu_save_disable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int enabled, idx; @@ -327,7 +327,8 @@ static u64 pmc_amd_save_disable_all(void) cpuc->enabled = 0; /* * ensure we write the disable before we start disabling the - * counters proper, so that pcm_amd_enable() does the right thing. + * counters proper, so that amd_pmu_enable_counter() does the + * right thing. */ barrier(); @@ -351,19 +352,19 @@ u64 hw_perf_save_disable(void) if (unlikely(!perf_counters_initialized)) return 0; - return pmc_ops->save_disable_all(); + return x86_pmu->save_disable_all(); } /* * Exported because of ACPI idle */ EXPORT_SYMBOL_GPL(hw_perf_save_disable); -static void pmc_intel_restore_all(u64 ctrl) +static void intel_pmu_restore_all(u64 ctrl) { wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); } -static void pmc_amd_restore_all(u64 ctrl) +static void amd_pmu_restore_all(u64 ctrl) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int idx; @@ -391,14 +392,14 @@ void hw_perf_restore(u64 ctrl) if (unlikely(!perf_counters_initialized)) return; - pmc_ops->restore_all(ctrl); + x86_pmu->restore_all(ctrl); } /* * Exported because of ACPI idle */ EXPORT_SYMBOL_GPL(hw_perf_restore); -static u64 pmc_intel_get_status(u64 mask) +static u64 intel_pmu_get_status(u64 mask) { u64 status; @@ -407,7 +408,7 @@ static u64 pmc_intel_get_status(u64 mask) return status; } -static u64 pmc_amd_get_status(u64 mask) +static u64 amd_pmu_get_status(u64 mask) { u64 status = 0; int idx; @@ -432,15 +433,15 @@ static u64 hw_perf_get_status(u64 mask) if (unlikely(!perf_counters_initialized)) return 0; - return pmc_ops->get_status(mask); + return x86_pmu->get_status(mask); } -static void pmc_intel_ack_status(u64 ack) +static void intel_pmu_ack_status(u64 ack) { wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static void pmc_amd_ack_status(u64 ack) +static void amd_pmu_ack_status(u64 ack) { } @@ -449,16 +450,16 @@ static void hw_perf_ack_status(u64 ack) if (unlikely(!perf_counters_initialized)) return; - pmc_ops->ack_status(ack); + x86_pmu->ack_status(ack); } -static void pmc_intel_enable(int idx, u64 config) +static void intel_pmu_enable_counter(int idx, u64 config) { wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config | ARCH_PERFMON_EVENTSEL0_ENABLE); } -static void pmc_amd_enable(int idx, u64 config) +static void amd_pmu_enable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -474,15 +475,15 @@ static void hw_perf_enable(int idx, u64 config) if (unlikely(!perf_counters_initialized)) return; - pmc_ops->enable(idx, config); + x86_pmu->enable(idx, config); } -static void pmc_intel_disable(int idx, u64 config) +static void intel_pmu_disable_counter(int idx, u64 config) { wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config); } -static void pmc_amd_disable(int idx, u64 config) +static void amd_pmu_disable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -496,7 +497,7 @@ static void hw_perf_disable(int idx, u64 config) if 
(unlikely(!perf_counters_initialized)) return; - pmc_ops->disable(idx, config); + x86_pmu->disable(idx, config); } static inline void @@ -613,11 +614,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS))) + if (unlikely(event == x86_pmu->event_map(PERF_COUNT_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES))) + if (unlikely(event == x86_pmu->event_map(PERF_COUNT_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES))) + if (unlikely(event == x86_pmu->event_map(PERF_COUNT_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; @@ -661,8 +662,8 @@ try_generic: set_bit(idx, cpuc->used); hwc->idx = idx; } - hwc->config_base = pmc_ops->eventsel; - hwc->counter_base = pmc_ops->perfctr; + hwc->config_base = x86_pmu->eventsel; + hwc->counter_base = x86_pmu->perfctr; } perf_counters_lapic_init(hwc->nmi); @@ -710,8 +711,8 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { - rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl); - rdmsrl(pmc_ops->perfctr + idx, pmc_count); + rdmsrl(x86_pmu->eventsel + idx, pmc_ctrl); + rdmsrl(x86_pmu->perfctr + idx, pmc_count); prev_left = per_cpu(prev_left[idx], cpu); @@ -918,35 +919,35 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { .priority = 1 }; -static struct pmc_x86_ops pmc_intel_ops = { - .save_disable_all = pmc_intel_save_disable_all, - .restore_all = pmc_intel_restore_all, - .get_status = pmc_intel_get_status, - .ack_status = pmc_intel_ack_status, - .enable = pmc_intel_enable, - .disable = pmc_intel_disable, +static struct x86_pmu intel_pmu = { + .save_disable_all = intel_pmu_save_disable_all, + .restore_all = intel_pmu_restore_all, + .get_status = intel_pmu_get_status, + .ack_status = intel_pmu_ack_status, + .enable = intel_pmu_enable_counter, + .disable = intel_pmu_disable_counter, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = pmc_intel_event_map, - .raw_event = pmc_intel_raw_event, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), }; -static struct pmc_x86_ops pmc_amd_ops = { - .save_disable_all = pmc_amd_save_disable_all, - .restore_all = pmc_amd_restore_all, - .get_status = pmc_amd_get_status, - .ack_status = pmc_amd_ack_status, - .enable = pmc_amd_enable, - .disable = pmc_amd_disable, +static struct x86_pmu amd_pmu = { + .save_disable_all = amd_pmu_save_disable_all, + .restore_all = amd_pmu_restore_all, + .get_status = amd_pmu_get_status, + .ack_status = amd_pmu_ack_status, + .enable = amd_pmu_enable_counter, + .disable = amd_pmu_disable_counter, .eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, - .event_map = pmc_amd_event_map, - .raw_event = pmc_amd_raw_event, + .event_map = amd_pmu_event_map, + .raw_event = amd_pmu_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), }; -static struct pmc_x86_ops *pmc_intel_init(void) +static struct x86_pmu *intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; @@ -977,10 +978,10 @@ static struct pmc_x86_ops *pmc_intel_init(void) nr_counters_fixed = edx.split.num_counters_fixed; counter_value_mask = (1ULL << eax.split.bit_width) - 1; - return &pmc_intel_ops; + return &intel_pmu; 
} -static struct pmc_x86_ops *pmc_amd_init(void) +static struct x86_pmu *amd_pmu_init(void) { nr_counters_generic = 4; nr_counters_fixed = 0; @@ -989,22 +990,22 @@ static struct pmc_x86_ops *pmc_amd_init(void) pr_info("AMD Performance Monitoring support detected.\n"); - return &pmc_amd_ops; + return &amd_pmu; } void __init init_hw_perf_counters(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: - pmc_ops = pmc_intel_init(); + x86_pmu = intel_pmu_init(); break; case X86_VENDOR_AMD: - pmc_ops = pmc_amd_init(); + x86_pmu = amd_pmu_init(); break; default: return; } - if (!pmc_ops) + if (!x86_pmu) return; pr_info("... num counters: %d\n", nr_counters_generic); -- cgit v1.2.3 From 39d81eab2374d71b2d9c82f66258a1a4f57ddd2e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:05 +0200 Subject: perf_counter, x86: make interrupt handler model specific This separates the perfcounter interrupt handler for AMD and Intel cpus. The AMD interrupt handler implementation is a follow-on patch. [ Impact: refactor and clean up code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-9-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 808a1a11346..9d90de0bd0b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -4,6 +4,7 @@ * Copyright(C) 2008 Thomas Gleixner * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar * Copyright(C) 2009 Jaswinder Singh Rajput + * Copyright(C) 2009 Advanced Micro Devices, Inc., Robert Richter * * For licencing details see kernel-base/COPYING */ @@ -47,6 +48,7 @@ struct cpu_hw_counters { * struct x86_pmu - generic x86 pmu */ struct x86_pmu { + int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); u64 (*get_status)(u64); @@ -241,6 +243,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; int err; + /* disable temporarily */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return -ENOSYS; + if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -780,7 +786,7 @@ static void perf_save_and_restart(struct perf_counter *counter) * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: */ -static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) +static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); u64 ack, status; @@ -827,6 +833,8 @@ out: return ret; } +static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { return 0; } + void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; @@ -851,7 +859,7 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_enter(); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); ack_APIC_irq(); - __smp_perf_counter_interrupt(regs, 0); + x86_pmu->handle_irq(regs, 0); irq_exit(); } @@ -908,7 +916,7 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - ret = __smp_perf_counter_interrupt(regs, 1); + ret = x86_pmu->handle_irq(regs, 1); return ret ? 
NOTIFY_STOP : NOTIFY_OK; } @@ -920,6 +928,7 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { }; static struct x86_pmu intel_pmu = { + .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, .get_status = intel_pmu_get_status, @@ -934,6 +943,7 @@ static struct x86_pmu intel_pmu = { }; static struct x86_pmu amd_pmu = { + .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, .get_status = amd_pmu_get_status, -- cgit v1.2.3 From b7f8859a8ed1937e2139c17b84878f1d413fa659 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:06 +0200 Subject: perf_counter, x86: remove get_status() from struct x86_pmu This function is Intel only and not necessary for AMD cpus. [ Impact: simplify code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-10-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 39 +++++--------------------------------- 1 file changed, 5 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9d90de0bd0b..d0bb02919c6 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -51,7 +51,6 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); - u64 (*get_status)(u64); void (*ack_status)(u64); void (*enable)(int, u64); void (*disable)(int, u64); @@ -405,41 +404,15 @@ void hw_perf_restore(u64 ctrl) */ EXPORT_SYMBOL_GPL(hw_perf_restore); -static u64 intel_pmu_get_status(u64 mask) +static inline u64 intel_pmu_get_status(u64 mask) { u64 status; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - - return status; -} - -static u64 amd_pmu_get_status(u64 mask) -{ - u64 status = 0; - int idx; - - for (idx = 0; idx < nr_counters_generic; idx++) { - s64 val; - - if (!(mask & (1 << idx))) - continue; - - rdmsrl(MSR_K7_PERFCTR0 + idx, val); - val <<= (64 - counter_value_bits); - if (val >= 0) - status |= (1 << idx); - } - - return status; -} - -static u64 hw_perf_get_status(u64 mask) -{ if (unlikely(!perf_counters_initialized)) return 0; + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - return x86_pmu->get_status(mask); + return status; } static void intel_pmu_ack_status(u64 ack) @@ -795,7 +768,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) cpuc->throttle_ctrl = hw_perf_save_disable(); - status = hw_perf_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(cpuc->throttle_ctrl); if (!status) goto out; @@ -820,7 +793,7 @@ again: /* * Repeat if there is more work to be done: */ - status = hw_perf_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(cpuc->throttle_ctrl); if (status) goto again; out: @@ -931,7 +904,6 @@ static struct x86_pmu intel_pmu = { .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, - .get_status = intel_pmu_get_status, .ack_status = intel_pmu_ack_status, .enable = intel_pmu_enable_counter, .disable = intel_pmu_disable_counter, @@ -946,7 +918,6 @@ static struct x86_pmu amd_pmu = { .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, - .get_status = amd_pmu_get_status, .ack_status = amd_pmu_ack_status, .enable = amd_pmu_enable_counter, .disable = 
amd_pmu_disable_counter, -- cgit v1.2.3 From dee5d9067ca78b317538fd67930be4e09a83dbc5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:07 +0200 Subject: perf_counter, x86: remove ack_status() from struct x86_pmu This function is Intel only and not necessary for AMD cpus. [ Impact: simplify code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-11-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d0bb02919c6..6bbdc16cc69 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -51,7 +51,6 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); - void (*ack_status)(u64); void (*enable)(int, u64); void (*disable)(int, u64); unsigned eventsel; @@ -415,23 +414,11 @@ static inline u64 intel_pmu_get_status(u64 mask) return status; } -static void intel_pmu_ack_status(u64 ack) +static inline void intel_pmu_ack_status(u64 ack) { wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static void amd_pmu_ack_status(u64 ack) -{ -} - -static void hw_perf_ack_status(u64 ack) -{ - if (unlikely(!perf_counters_initialized)) - return; - - x86_pmu->ack_status(ack); -} - static void intel_pmu_enable_counter(int idx, u64 config) { wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, @@ -788,7 +775,7 @@ again: __x86_pmu_disable(counter, &counter->hw, bit); } - hw_perf_ack_status(ack); + intel_pmu_ack_status(ack); /* * Repeat if there is more work to be done: @@ -904,7 +891,6 @@ static struct x86_pmu intel_pmu = { .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, - .ack_status = intel_pmu_ack_status, .enable = intel_pmu_enable_counter, .disable = intel_pmu_disable_counter, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, @@ -918,7 +904,6 @@ static struct x86_pmu amd_pmu = { .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, - .ack_status = amd_pmu_ack_status, .enable = amd_pmu_enable_counter, .disable = amd_pmu_disable_counter, .eventsel = MSR_K7_EVNTSEL0, -- cgit v1.2.3 From 26816c287e13eedc67bc4ed0cd40c138314b7c7d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:08 +0200 Subject: perf_counter, x86: rename __hw_perf_counter_set_period into x86_perf_counter_set_period [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-12-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6bbdc16cc69..fa6541d781b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -498,7 +498,7 @@ static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); * To be called with the counter disabled in hw: */ static void -__hw_perf_counter_set_period(struct perf_counter *counter, +x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); @@ -642,7 +642,7 @@ try_generic: */ barrier(); 
- __hw_perf_counter_set_period(counter, hwc, idx); + x86_perf_counter_set_period(counter, hwc, idx); __x86_pmu_enable(counter, hwc, idx); return 0; @@ -731,7 +731,7 @@ static void perf_save_and_restart(struct perf_counter *counter) int idx = hwc->idx; x86_perf_counter_update(counter, hwc, idx); - __hw_perf_counter_set_period(counter, hwc, idx); + x86_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) __x86_pmu_enable(counter, hwc, idx); -- cgit v1.2.3 From 55de0f2e57994b525324bf0d04d242d9358a2417 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:09 +0200 Subject: perf_counter, x86: rename intel only functions [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-13-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fa6541d781b..5a52d73ccfa 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -725,7 +725,7 @@ static void x86_pmu_disable(struct perf_counter *counter) * Save and restart an expired counter. Called by NMI contexts, * so it has to be careful about preempting normal counter ops: */ -static void perf_save_and_restart(struct perf_counter *counter) +static void intel_pmu_save_and_restart(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; @@ -753,7 +753,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); int ret = 0; - cpuc->throttle_ctrl = hw_perf_save_disable(); + cpuc->throttle_ctrl = intel_pmu_save_disable_all(); status = intel_pmu_get_status(cpuc->throttle_ctrl); if (!status) @@ -770,7 +770,7 @@ again: if (!counter) continue; - perf_save_and_restart(counter); + intel_pmu_save_and_restart(counter); if (perf_counter_overflow(counter, nmi, regs, 0)) __x86_pmu_disable(counter, &counter->hw, bit); } @@ -788,7 +788,7 @@ out: * Restore - do not reenable when global enable is off or throttled: */ if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) - hw_perf_restore(cpuc->throttle_ctrl); + intel_pmu_restore_all(cpuc->throttle_ctrl); return ret; } -- cgit v1.2.3 From 72eae04d3a3075c26d39e1e685acfc8e8c29db64 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:10 +0200 Subject: perf_counter, x86: modify initialization of struct x86_pmu This patch adds an error handler and changes initialization of struct x86_pmu. No functional changes. Needed for follow-on patches. 
[ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-14-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5a52d73ccfa..7c72a942363 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -913,7 +913,7 @@ static struct x86_pmu amd_pmu = { .max_events = ARRAY_SIZE(amd_perfmon_event_map), }; -static struct x86_pmu *intel_pmu_init(void) +static int intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; @@ -921,7 +921,7 @@ static struct x86_pmu *intel_pmu_init(void) unsigned int ebx; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return NULL; + return -ENODEV; /* * Check whether the Architectural PerfMon supports @@ -929,49 +929,54 @@ static struct x86_pmu *intel_pmu_init(void) */ cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) - return NULL; + return -ENODEV; intel_perfmon_version = eax.split.version_id; if (intel_perfmon_version < 2) - return NULL; + return -ENODEV; pr_info("Intel Performance Monitoring support detected.\n"); pr_info("... version: %d\n", intel_perfmon_version); pr_info("... bit width: %d\n", eax.split.bit_width); pr_info("... mask length: %d\n", eax.split.mask_length); + x86_pmu = &intel_pmu; + nr_counters_generic = eax.split.num_counters; nr_counters_fixed = edx.split.num_counters_fixed; counter_value_mask = (1ULL << eax.split.bit_width) - 1; - return &intel_pmu; + return 0; } -static struct x86_pmu *amd_pmu_init(void) +static int amd_pmu_init(void) { + x86_pmu = &amd_pmu; + nr_counters_generic = 4; nr_counters_fixed = 0; counter_value_mask = 0x0000FFFFFFFFFFFFULL; counter_value_bits = 48; pr_info("AMD Performance Monitoring support detected.\n"); - - return &amd_pmu; + return 0; } void __init init_hw_perf_counters(void) { + int err; + switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: - x86_pmu = intel_pmu_init(); + err = intel_pmu_init(); break; case X86_VENDOR_AMD: - x86_pmu = amd_pmu_init(); + err = amd_pmu_init(); break; default: return; } - if (!x86_pmu) + if (err != 0) return; pr_info("... num counters: %d\n", nr_counters_generic); -- cgit v1.2.3 From 4a06bd8508f65ad1dd5cd2046b85694813fa36a2 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:11 +0200 Subject: perf_counter, x86: make x86_pmu data a static struct Instead of using a pointer to reference to the x86 pmu we now have one single data structure that is initialized at the beginning. This saves the pointer access when using this memory. 
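The saving described above is a single memory access per call: with a static struct, the chosen vendor description is copied once at init time and every later use is a direct member access instead of a load through a pointer. A rough before/after sketch with hypothetical names, not the exact kernel code:

struct pmu_desc {
	unsigned eventsel;	/* first event-select MSR */
	unsigned perfctr;	/* first counter MSR */
};

static const struct pmu_desc intel_desc = { 0x186, 0xc1 };

/* before: global pointer, dereferenced on every access */
static const struct pmu_desc *pmu_ptr;
static unsigned eventsel_via_pointer(void) { return pmu_ptr->eventsel; }

/* after: one static instance, copied by value during init */
static struct pmu_desc pmu;
static unsigned eventsel_direct(void)      { return pmu.eventsel; }

static void init_pmu(void)
{
	pmu_ptr = &intel_desc;	/* old scheme */
	pmu     = intel_desc;	/* new scheme: struct assignment at init time */
}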
[ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-15-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 50 +++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7c72a942363..68597d76338 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -60,7 +60,7 @@ struct x86_pmu { int max_events; }; -static struct x86_pmu *x86_pmu __read_mostly; +static struct x86_pmu x86_pmu __read_mostly; static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, @@ -184,12 +184,12 @@ static bool reserve_pmc_hardware(void) disable_lapic_nmi_watchdog(); for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_perfctr_nmi(x86_pmu->perfctr + i)) + if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) goto perfctr_fail; } for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_evntsel_nmi(x86_pmu->eventsel + i)) + if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } @@ -197,13 +197,13 @@ static bool reserve_pmc_hardware(void) eventsel_fail: for (i--; i >= 0; i--) - release_evntsel_nmi(x86_pmu->eventsel + i); + release_evntsel_nmi(x86_pmu.eventsel + i); i = nr_counters_generic; perfctr_fail: for (i--; i >= 0; i--) - release_perfctr_nmi(x86_pmu->perfctr + i); + release_perfctr_nmi(x86_pmu.perfctr + i); if (nmi_watchdog == NMI_LOCAL_APIC) enable_lapic_nmi_watchdog(); @@ -216,8 +216,8 @@ static void release_pmc_hardware(void) int i; for (i = 0; i < nr_counters_generic; i++) { - release_perfctr_nmi(x86_pmu->perfctr + i); - release_evntsel_nmi(x86_pmu->eventsel + i); + release_perfctr_nmi(x86_pmu.perfctr + i); + release_evntsel_nmi(x86_pmu.eventsel + i); } if (nmi_watchdog == NMI_LOCAL_APIC) @@ -297,14 +297,14 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * Raw event type provide the config in the event structure */ if (perf_event_raw(hw_event)) { - hwc->config |= x86_pmu->raw_event(perf_event_config(hw_event)); + hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event)); } else { - if (perf_event_id(hw_event) >= x86_pmu->max_events) + if (perf_event_id(hw_event) >= x86_pmu.max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= x86_pmu->event_map(perf_event_id(hw_event)); + hwc->config |= x86_pmu.event_map(perf_event_id(hw_event)); } counter->destroy = hw_perf_counter_destroy; @@ -356,7 +356,7 @@ u64 hw_perf_save_disable(void) if (unlikely(!perf_counters_initialized)) return 0; - return x86_pmu->save_disable_all(); + return x86_pmu.save_disable_all(); } /* * Exported because of ACPI idle @@ -396,7 +396,7 @@ void hw_perf_restore(u64 ctrl) if (unlikely(!perf_counters_initialized)) return; - x86_pmu->restore_all(ctrl); + x86_pmu.restore_all(ctrl); } /* * Exported because of ACPI idle @@ -441,7 +441,7 @@ static void hw_perf_enable(int idx, u64 config) if (unlikely(!perf_counters_initialized)) return; - x86_pmu->enable(idx, config); + x86_pmu.enable(idx, config); } static void intel_pmu_disable_counter(int idx, u64 config) @@ -463,7 +463,7 @@ static void hw_perf_disable(int idx, u64 config) if (unlikely(!perf_counters_initialized)) return; - x86_pmu->disable(idx, config); + x86_pmu.disable(idx, config); } static inline void @@ -580,11 +580,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = 
hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == x86_pmu->event_map(PERF_COUNT_INSTRUCTIONS))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == x86_pmu->event_map(PERF_COUNT_CPU_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == x86_pmu->event_map(PERF_COUNT_BUS_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; @@ -628,8 +628,8 @@ try_generic: set_bit(idx, cpuc->used); hwc->idx = idx; } - hwc->config_base = x86_pmu->eventsel; - hwc->counter_base = x86_pmu->perfctr; + hwc->config_base = x86_pmu.eventsel; + hwc->counter_base = x86_pmu.perfctr; } perf_counters_lapic_init(hwc->nmi); @@ -677,8 +677,8 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { - rdmsrl(x86_pmu->eventsel + idx, pmc_ctrl); - rdmsrl(x86_pmu->perfctr + idx, pmc_count); + rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); + rdmsrl(x86_pmu.perfctr + idx, pmc_count); prev_left = per_cpu(prev_left[idx], cpu); @@ -819,7 +819,7 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_enter(); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); ack_APIC_irq(); - x86_pmu->handle_irq(regs, 0); + x86_pmu.handle_irq(regs, 0); irq_exit(); } @@ -876,7 +876,7 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - ret = x86_pmu->handle_irq(regs, 1); + ret = x86_pmu.handle_irq(regs, 1); return ret ? NOTIFY_STOP : NOTIFY_OK; } @@ -940,7 +940,7 @@ static int intel_pmu_init(void) pr_info("... bit width: %d\n", eax.split.bit_width); pr_info("... 
mask length: %d\n", eax.split.mask_length); - x86_pmu = &intel_pmu; + x86_pmu = intel_pmu; nr_counters_generic = eax.split.num_counters; nr_counters_fixed = edx.split.num_counters_fixed; @@ -951,7 +951,7 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { - x86_pmu = &amd_pmu; + x86_pmu = amd_pmu; nr_counters_generic = 4; nr_counters_fixed = 0; -- cgit v1.2.3 From 0933e5c6a680ba8d8d786a6f7fa377b7ec0d1e49 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:12 +0200 Subject: perf_counter, x86: move counter parameters to struct x86_pmu [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-16-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 80 ++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 68597d76338..75dbb1f0900 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -24,16 +24,7 @@ #include static bool perf_counters_initialized __read_mostly; - -/* - * Number of (generic) HW counters: - */ -static int nr_counters_generic __read_mostly; static u64 perf_counter_mask __read_mostly; -static u64 counter_value_mask __read_mostly; -static int counter_value_bits __read_mostly; - -static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; @@ -58,6 +49,10 @@ struct x86_pmu { u64 (*event_map)(int); u64 (*raw_event)(u64); int max_events; + int num_counters; + int num_counters_fixed; + int counter_bits; + u64 counter_mask; }; static struct x86_pmu x86_pmu __read_mostly; @@ -183,12 +178,12 @@ static bool reserve_pmc_hardware(void) if (nmi_watchdog == NMI_LOCAL_APIC) disable_lapic_nmi_watchdog(); - for (i = 0; i < nr_counters_generic; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) goto perfctr_fail; } - for (i = 0; i < nr_counters_generic; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } @@ -199,7 +194,7 @@ eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu.eventsel + i); - i = nr_counters_generic; + i = x86_pmu.num_counters; perfctr_fail: for (i--; i >= 0; i--) @@ -215,7 +210,7 @@ static void release_pmc_hardware(void) { int i; - for (i = 0; i < nr_counters_generic; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { release_perfctr_nmi(x86_pmu.perfctr + i); release_evntsel_nmi(x86_pmu.eventsel + i); } @@ -336,7 +331,7 @@ static u64 amd_pmu_save_disable_all(void) */ barrier(); - for (idx = 0; idx < nr_counters_generic; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; if (!test_bit(idx, cpuc->active_mask)) @@ -378,7 +373,7 @@ static void amd_pmu_restore_all(u64 ctrl) if (!ctrl) return; - for (idx = 0; idx < nr_counters_generic; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; if (!test_bit(idx, cpuc->active_mask)) @@ -527,7 +522,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, atomic64_set(&hwc->prev_count, (u64)-left); err = checking_wrmsrl(hwc->counter_base + idx, - (u64)(-left) & counter_value_mask); + (u64)(-left) & x86_pmu.counter_mask); } static inline void @@ -621,8 +616,9 @@ static int x86_pmu_enable(struct perf_counter *counter) /* Try to get the previous generic 
counter again */ if (test_and_set_bit(idx, cpuc->used)) { try_generic: - idx = find_first_zero_bit(cpuc->used, nr_counters_generic); - if (idx == nr_counters_generic) + idx = find_first_zero_bit(cpuc->used, + x86_pmu.num_counters); + if (idx == x86_pmu.num_counters) return -EAGAIN; set_bit(idx, cpuc->used); @@ -654,7 +650,7 @@ void perf_counter_print_debug(void) struct cpu_hw_counters *cpuc; int cpu, idx; - if (!nr_counters_generic) + if (!x86_pmu.num_counters) return; local_irq_disable(); @@ -676,7 +672,7 @@ void perf_counter_print_debug(void) } pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); - for (idx = 0; idx < nr_counters_generic; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); rdmsrl(x86_pmu.perfctr + idx, pmc_count); @@ -689,7 +685,7 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } - for (idx = 0; idx < nr_counters_fixed; idx++) { + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", @@ -911,6 +907,9 @@ static struct x86_pmu amd_pmu = { .event_map = amd_pmu_event_map, .raw_event = amd_pmu_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_counters = 4, + .counter_bits = 48, + .counter_mask = (1ULL << 48) - 1, }; static int intel_pmu_init(void) @@ -941,10 +940,10 @@ static int intel_pmu_init(void) pr_info("... mask length: %d\n", eax.split.mask_length); x86_pmu = intel_pmu; - - nr_counters_generic = eax.split.num_counters; - nr_counters_fixed = edx.split.num_counters_fixed; - counter_value_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; + x86_pmu.counter_bits = eax.split.bit_width; + x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; return 0; } @@ -952,12 +951,6 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { x86_pmu = amd_pmu; - - nr_counters_generic = 4; - nr_counters_fixed = 0; - counter_value_mask = 0x0000FFFFFFFFFFFFULL; - counter_value_bits = 48; - pr_info("AMD Performance Monitoring support detected.\n"); return 0; } @@ -979,25 +972,26 @@ void __init init_hw_perf_counters(void) if (err != 0) return; - pr_info("... num counters: %d\n", nr_counters_generic); - if (nr_counters_generic > X86_PMC_MAX_GENERIC) { - nr_counters_generic = X86_PMC_MAX_GENERIC; + pr_info("... num counters: %d\n", x86_pmu.num_counters); + if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { + x86_pmu.num_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - nr_counters_generic, X86_PMC_MAX_GENERIC); + x86_pmu.num_counters, X86_PMC_MAX_GENERIC); } - perf_counter_mask = (1 << nr_counters_generic) - 1; - perf_max_counters = nr_counters_generic; + perf_counter_mask = (1 << x86_pmu.num_counters) - 1; + perf_max_counters = x86_pmu.num_counters; - pr_info("... value mask: %016Lx\n", counter_value_mask); + pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); - if (nr_counters_fixed > X86_PMC_MAX_FIXED) { - nr_counters_fixed = X86_PMC_MAX_FIXED; + if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { + x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", - nr_counters_fixed, X86_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); } - pr_info("... fixed counters: %d\n", nr_counters_fixed); + pr_info("... 
fixed counters: %d\n", x86_pmu.num_counters_fixed); - perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED; + perf_counter_mask |= + ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; pr_info("... counter mask: %016Lx\n", perf_counter_mask); perf_counters_initialized = true; -- cgit v1.2.3 From faa28ae018ed004a22aa4a7704e04ccdde4a941e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:13 +0200 Subject: perf_counter, x86: make pmu version generic This makes the use of the version variable generic. Also, some debug messages have been generalized. [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-17-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 75dbb1f0900..15d2c03e16f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -39,6 +39,8 @@ struct cpu_hw_counters { * struct x86_pmu - generic x86 pmu */ struct x86_pmu { + const char *name; + int version; int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); @@ -61,8 +63,6 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, }; -static __read_mostly int intel_perfmon_version; - /* * Intel PerfMon v3. Used on Core2 and later. */ @@ -658,7 +658,7 @@ void perf_counter_print_debug(void) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - if (intel_perfmon_version >= 2) { + if (x86_pmu.version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); @@ -884,6 +884,7 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { }; static struct x86_pmu intel_pmu = { + .name = "Intel", .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, @@ -897,6 +898,7 @@ static struct x86_pmu intel_pmu = { }; static struct x86_pmu amd_pmu = { + .name = "AMD", .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, @@ -918,6 +920,7 @@ static int intel_pmu_init(void) union cpuid10_eax eax; unsigned int unused; unsigned int ebx; + int version; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return -ENODEV; @@ -930,16 +933,12 @@ static int intel_pmu_init(void) if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return -ENODEV; - intel_perfmon_version = eax.split.version_id; - if (intel_perfmon_version < 2) + version = eax.split.version_id; + if (version < 2) return -ENODEV; - pr_info("Intel Performance Monitoring support detected.\n"); - pr_info("... version: %d\n", intel_perfmon_version); - pr_info("... bit width: %d\n", eax.split.bit_width); - pr_info("... 
mask length: %d\n", eax.split.mask_length); - x86_pmu = intel_pmu; + x86_pmu.version = version; x86_pmu.num_counters = eax.split.num_counters; x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; x86_pmu.counter_bits = eax.split.bit_width; @@ -951,7 +950,6 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { x86_pmu = amd_pmu; - pr_info("AMD Performance Monitoring support detected.\n"); return 0; } @@ -972,6 +970,10 @@ void __init init_hw_perf_counters(void) if (err != 0) return; + pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name); + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.counter_bits); + pr_info("... num counters: %d\n", x86_pmu.num_counters); if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { x86_pmu.num_counters = X86_PMC_MAX_GENERIC; -- cgit v1.2.3 From bb775fc2d1dcd1aa6eafde37a8289ba2d80783aa Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:14 +0200 Subject: perf_counter, x86: make x86_pmu_read() static inline [ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-18-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 15d2c03e16f..3f3ae477a7d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1002,7 +1002,7 @@ void __init init_hw_perf_counters(void) register_die_notifier(&perf_counter_nmi_notifier); } -static void x86_pmu_read(struct perf_counter *counter) +static inline void x86_pmu_read(struct perf_counter *counter) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); } -- cgit v1.2.3 From 93904966934193204ad08e951f806d5631c29eb3 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:15 +0200 Subject: perf_counter, x86: rename cpuc->active_mask This is to have a consistent naming scheme with cpuc->used. 
[ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-19-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3f3ae477a7d..9ec51a662db 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -29,9 +29,9 @@ static u64 perf_counter_mask __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long active[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; u64 throttle_ctrl; - unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; int enabled; }; @@ -334,7 +334,7 @@ static u64 amd_pmu_save_disable_all(void) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active_mask)) + if (!test_bit(idx, cpuc->active)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) @@ -376,7 +376,7 @@ static void amd_pmu_restore_all(u64 ctrl) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active_mask)) + if (!test_bit(idx, cpuc->active)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) @@ -424,7 +424,7 @@ static void amd_pmu_enable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - set_bit(idx, cpuc->active_mask); + set_bit(idx, cpuc->active); if (cpuc->enabled) config |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -448,7 +448,7 @@ static void amd_pmu_disable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - clear_bit(idx, cpuc->active_mask); + clear_bit(idx, cpuc->active); wrmsrl(MSR_K7_EVNTSEL0 + idx, config); } -- cgit v1.2.3 From 095342389e2ed8deed07b3076f990260ce3c7c9f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:16 +0200 Subject: perf_counter, x86: generic use of cpuc->active cpuc->active will now be used to indicate an enabled counter, which also implies a valid pointer in cpuc->counters[]. In contrast, cpuc->used only locks the counter, which may still be uninitialized.
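Put differently, after this change the two masks carry distinct guarantees: "used" merely reserves a counter slot that may still be half set up, while "active" is set only once cpuc->counters[idx] is valid, so the NMI path keys off "active" alone. A simplified sketch of that invariant, with invented names (the real code uses the kernel bitmap helpers and per-cpu data):

struct counter;

struct cpu_counters {
	unsigned long used;		/* slot reserved, not necessarily initialized */
	unsigned long active;		/* slot fully set up, safe for the NMI handler */
	struct counter *counters[8];
};

static void install_counter(struct cpu_counters *cpuc, int idx, struct counter *c)
{
	cpuc->used |= 1UL << idx;	/* 1) reserve the slot                 */
	cpuc->counters[idx] = c;	/* 2) publish the pointer              */
	cpuc->active |= 1UL << idx;	/* 3) only now may the NMI path use it */
}

static struct counter *nmi_lookup(struct cpu_counters *cpuc, int idx)
{
	if (!(cpuc->active & (1UL << idx)))
		return 0;		/* reserved-but-uninitialized slots are skipped */
	return cpuc->counters[idx];	/* guaranteed valid once 'active' is set */
}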
[ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-20-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9ec51a662db..f7fd4a35515 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -424,7 +424,6 @@ static void amd_pmu_enable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - set_bit(idx, cpuc->active); if (cpuc->enabled) config |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -446,9 +445,6 @@ static void intel_pmu_disable_counter(int idx, u64 config) static void amd_pmu_disable_counter(int idx, u64 config) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - clear_bit(idx, cpuc->active); wrmsrl(MSR_K7_EVNTSEL0 + idx, config); } @@ -633,10 +629,7 @@ try_generic: __x86_pmu_disable(counter, hwc, idx); cpuc->counters[idx] = counter; - /* - * Make it visible before enabling the hw: - */ - barrier(); + set_bit(idx, cpuc->active); x86_perf_counter_set_period(counter, hwc, idx); __x86_pmu_enable(counter, hwc, idx); @@ -700,10 +693,13 @@ static void x86_pmu_disable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; + /* + * Must be done before we disable, otherwise the nmi handler + * could reenable again: + */ + clear_bit(idx, cpuc->active); __x86_pmu_disable(counter, hwc, idx); - clear_bit(idx, cpuc->used); - cpuc->counters[idx] = NULL; /* * Make sure the cleared pointer becomes visible before we * (potentially) free the counter: @@ -715,6 +711,8 @@ static void x86_pmu_disable(struct perf_counter *counter) * that we are disabling: */ x86_perf_counter_update(counter, hwc, idx); + cpuc->counters[idx] = NULL; + clear_bit(idx, cpuc->used); } /* @@ -763,7 +761,7 @@ again: struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); - if (!counter) + if (!test_bit(bit, cpuc->active)) continue; intel_pmu_save_and_restart(counter); -- cgit v1.2.3 From 6f00cada07bb5da7f751929d3173494dcc5446cc Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:17 +0200 Subject: perf_counter, x86: consistent use of type int for counter index The type of counter index is sometimes implemented as unsigned int. This patch changes this to have a consistent usage of int. 
[ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-21-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f7fd4a35515..d8beebeb270 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -459,7 +459,7 @@ static void hw_perf_disable(int idx, u64 config) static inline void __pmc_fixed_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int __idx) + struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; @@ -474,7 +474,7 @@ __pmc_fixed_disable(struct perf_counter *counter, static inline void __x86_pmu_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int idx) + struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_disable(counter, hwc, idx); @@ -523,7 +523,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, static inline void __pmc_fixed_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int __idx) + struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; @@ -691,7 +691,7 @@ static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; - unsigned int idx = hwc->idx; + int idx = hwc->idx; /* * Must be done before we disable, otherwise the nmi handler -- cgit v1.2.3 From 7c90cc45f89af4dd4617f97d452740ad95b800d5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:18 +0200 Subject: perf_counter, x86: rework counter enable functions There is vendor specific code in generic x86 code, and there is vendor specific code that could be generic. This patch introduces x86_pmu_enable_counter() for x86 generic code. Fixed counter code for Intel is moved to Intel only functions. In the end, checks and calls via function pointers were reduced to the necessary. Also, the internal function i/f changed. 
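The shape of the resulting split, sketched in user space (struct pmu, the counter type and all function bodies below are illustrative stand-ins, not the kernel's struct x86_pmu): a shared low-level helper performs the common enable, the Intel wrapper adds the fixed-counter special case, the AMD wrapper consults its per-cpu enabled state, and callers only ever go through the function pointer.

    #include <stdio.h>
    #include <stdbool.h>

    struct hw_counter { unsigned long config; };

    static void generic_enable(struct hw_counter *hwc, int idx)
    {
        printf("write eventsel %d, config %#lx, enable bit set\n", idx, hwc->config);
    }

    static void intel_enable(struct hw_counter *hwc, int idx)
    {
        if (idx >= 32) {                    /* pretend: fixed-purpose range */
            printf("program fixed counter %d\n", idx - 32);
            return;
        }
        generic_enable(hwc, idx);
    }

    static bool cpu_enabled = true;         /* stand-in for cpuc->enabled */

    static void amd_enable(struct hw_counter *hwc, int idx)
    {
        if (cpu_enabled)
            generic_enable(hwc, idx);
        /* else: leave the counter programmed but without the enable bit */
    }

    struct pmu { void (*enable)(struct hw_counter *, int); };

    int main(void)
    {
        struct hw_counter hwc = { .config = 0x3c };
        struct pmu intel = { .enable = intel_enable };
        struct pmu amd   = { .enable = amd_enable };

        intel.enable(&hwc, 0);
        intel.enable(&hwc, 32);
        amd.enable(&hwc, 0);
        return 0;
    }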
[ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-22-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 52 ++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d8beebeb270..ae55933ce79 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -44,7 +44,7 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); - void (*enable)(int, u64); + void (*enable)(struct hw_perf_counter *, int); void (*disable)(int, u64); unsigned eventsel; unsigned perfctr; @@ -414,28 +414,15 @@ static inline void intel_pmu_ack_status(u64 ack) wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static void intel_pmu_enable_counter(int idx, u64 config) +static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, - config | ARCH_PERFMON_EVENTSEL0_ENABLE); -} - -static void amd_pmu_enable_counter(int idx, u64 config) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - if (cpuc->enabled) - config |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - wrmsrl(MSR_K7_EVNTSEL0 + idx, config); -} + int err; -static void hw_perf_enable(int idx, u64 config) -{ if (unlikely(!perf_counters_initialized)) return; - x86_pmu.enable(idx, config); + err = checking_wrmsrl(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } static void intel_pmu_disable_counter(int idx, u64 config) @@ -522,8 +509,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, } static inline void -__pmc_fixed_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int __idx) +intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; @@ -548,14 +534,24 @@ __pmc_fixed_enable(struct perf_counter *counter, err = checking_wrmsrl(hwc->config_base, ctrl_val); } -static void -__x86_pmu_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - __pmc_fixed_enable(counter, hwc, idx); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_enable_fixed(hwc, idx); + return; + } + + x86_pmu_enable_counter(hwc, idx); +} + +static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (cpuc->enabled) + x86_pmu_enable_counter(hwc, idx); else - hw_perf_enable(idx, hwc->config); + amd_pmu_disable_counter(idx, hwc->config); } static int @@ -632,7 +628,7 @@ try_generic: set_bit(idx, cpuc->active); x86_perf_counter_set_period(counter, hwc, idx); - __x86_pmu_enable(counter, hwc, idx); + x86_pmu.enable(hwc, idx); return 0; } @@ -728,7 +724,7 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) x86_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) - __x86_pmu_enable(counter, hwc, idx); + intel_pmu_enable_counter(hwc, idx); } /* -- cgit v1.2.3 From d43698918bd46c71d494555fb92195fbea1fcb6c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:19 +0200 Subject: 
perf_counter, x86: rework counter disable functions As for the enable function, this patch reworks the disable functions and introduces x86_pmu_disable_counter(). The internal function i/f in struct x86_pmu changed too. [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-23-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 48 ++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ae55933ce79..df9012bbd21 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -45,7 +45,7 @@ struct x86_pmu { u64 (*save_disable_all)(void); void (*restore_all)(u64); void (*enable)(struct hw_perf_counter *, int); - void (*disable)(int, u64); + void (*disable)(struct hw_perf_counter *, int); unsigned eventsel; unsigned perfctr; u64 (*event_map)(int); @@ -425,28 +425,19 @@ static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } -static void intel_pmu_disable_counter(int idx, u64 config) +static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config); -} - -static void amd_pmu_disable_counter(int idx, u64 config) -{ - wrmsrl(MSR_K7_EVNTSEL0 + idx, config); - -} + int err; -static void hw_perf_disable(int idx, u64 config) -{ if (unlikely(!perf_counters_initialized)) return; - x86_pmu.disable(idx, config); + err = checking_wrmsrl(hwc->config_base + idx, + hwc->config); } static inline void -__pmc_fixed_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int __idx) +intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; @@ -460,13 +451,20 @@ __pmc_fixed_disable(struct perf_counter *counter, } static inline void -__x86_pmu_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - __pmc_fixed_disable(counter, hwc, idx); - else - hw_perf_disable(idx, hwc->config); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_disable_fixed(hwc, idx); + return; + } + + x86_pmu_disable_counter(hwc, idx); +} + +static inline void +amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ + x86_pmu_disable_counter(hwc, idx); } static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); @@ -551,7 +549,7 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) if (cpuc->enabled) x86_pmu_enable_counter(hwc, idx); else - amd_pmu_disable_counter(idx, hwc->config); + x86_pmu_disable_counter(hwc, idx); } static int @@ -622,7 +620,7 @@ try_generic: perf_counters_lapic_init(hwc->nmi); - __x86_pmu_disable(counter, hwc, idx); + x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; set_bit(idx, cpuc->active); @@ -694,7 +692,7 @@ static void x86_pmu_disable(struct perf_counter *counter) * could reenable again: */ clear_bit(idx, cpuc->active); - __x86_pmu_disable(counter, hwc, idx); + x86_pmu.disable(hwc, idx); /* * Make sure the cleared pointer becomes visible before we @@ -762,7 +760,7 @@ again: intel_pmu_save_and_restart(counter); if (perf_counter_overflow(counter, nmi, regs, 
0)) - __x86_pmu_disable(counter, &counter->hw, bit); + intel_pmu_disable_counter(&counter->hw, bit); } intel_pmu_ack_status(ack); -- cgit v1.2.3 From 85cf9dba92152bb4edec118b2f4f0be1ae7fdcab Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:20 +0200 Subject: perf_counter, x86: change and remove pmu initialization checks Some functions are only called if the pmu was proper initialized. That initalization checks can be removed. The way to check initialization changed too. Now, the pointer to the interrupt handler is checked. If it exists the pmu is initialized. This also removes a static variable and uses struct x86_pmu as only data source for the check. [ Impact: simplify code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-24-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index df9012bbd21..2d3681bbb52 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -23,7 +23,6 @@ #include #include -static bool perf_counters_initialized __read_mostly; static u64 perf_counter_mask __read_mostly; struct cpu_hw_counters { @@ -227,6 +226,11 @@ static void hw_perf_counter_destroy(struct perf_counter *counter) } } +static inline int x86_pmu_initialized(void) +{ + return x86_pmu.handle_irq != NULL; +} + /* * Setup the hardware configuration for a given hw_event_type */ @@ -240,8 +244,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return -ENOSYS; - if (unlikely(!perf_counters_initialized)) - return -EINVAL; + if (!x86_pmu_initialized()) + return -ENODEV; err = 0; if (atomic_inc_not_zero(&num_counters)) { @@ -348,9 +352,8 @@ static u64 amd_pmu_save_disable_all(void) u64 hw_perf_save_disable(void) { - if (unlikely(!perf_counters_initialized)) + if (!x86_pmu_initialized()) return 0; - return x86_pmu.save_disable_all(); } /* @@ -388,9 +391,8 @@ static void amd_pmu_restore_all(u64 ctrl) void hw_perf_restore(u64 ctrl) { - if (unlikely(!perf_counters_initialized)) + if (!x86_pmu_initialized()) return; - x86_pmu.restore_all(ctrl); } /* @@ -402,8 +404,6 @@ static inline u64 intel_pmu_get_status(u64 mask) { u64 status; - if (unlikely(!perf_counters_initialized)) - return 0; rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); return status; @@ -417,10 +417,6 @@ static inline void intel_pmu_ack_status(u64 ack) static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { int err; - - if (unlikely(!perf_counters_initialized)) - return; - err = checking_wrmsrl(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } @@ -428,10 +424,6 @@ static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { int err; - - if (unlikely(!perf_counters_initialized)) - return; - err = checking_wrmsrl(hwc->config_base + idx, hwc->config); } @@ -787,10 +779,10 @@ void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + if (!x86_pmu_initialized()) return; - if (unlikely(!perf_counters_initialized)) + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; cpuc = &__get_cpu_var(cpu_hw_counters); 
@@ -829,8 +821,9 @@ void perf_counters_lapic_init(int nmi) { u32 apic_val; - if (!perf_counters_initialized) + if (!x86_pmu_initialized()) return; + /* * Enable the performance counter vector in the APIC LVT: */ @@ -988,7 +981,6 @@ void __init init_hw_perf_counters(void) ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_initialized = true; perf_counters_lapic_init(0); register_die_notifier(&perf_counter_nmi_notifier); -- cgit v1.2.3 From a29aa8a7ff93e4196d558036928597e68337dd8d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:21 +0200 Subject: perf_counter, x86: implement the interrupt handler for AMD cpus This patch implements the interrupt handler for AMD performance counters. In difference to the Intel pmu, there is no single status register and also there are no fixed counters. This makes the handler very different and it is useful to make the handler vendor specific. To check if a counter is overflowed the upper bit of the counter is checked. Only counters where the active bit is set are checked. With this patch throttling is enabled for AMD performance counters. This patch also reenables Linux performance counters on AMD cpus. [ Impact: re-enable perfcounters on AMD CPUs ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-25-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 45 +++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2d3681bbb52..f4d59d4cf3f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -240,10 +240,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; int err; - /* disable temporarily */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return -ENOSYS; - if (!x86_pmu_initialized()) return -ENODEV; @@ -773,7 +769,43 @@ out: return ret; } -static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { return 0; } +static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) +{ + int cpu = smp_processor_id(); + struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); + u64 val; + int handled = 0; + struct perf_counter *counter; + struct hw_perf_counter *hwc; + int idx; + + ++cpuc->interrupts; + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active)) + continue; + counter = cpuc->counters[idx]; + hwc = &counter->hw; + x86_perf_counter_update(counter, hwc, idx); + val = atomic64_read(&hwc->prev_count); + if (val & (1ULL << (x86_pmu.counter_bits - 1))) + continue; + /* counter overflow */ + x86_perf_counter_set_period(counter, hwc, idx); + handled = 1; + inc_irq_stat(apic_perf_irqs); + if (perf_counter_overflow(counter, nmi, regs, 0)) + amd_pmu_disable_counter(hwc, idx); + else if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) + /* + * do not reenable when throttled, but reload + * the register + */ + amd_pmu_disable_counter(hwc, idx); + else if (counter->state == PERF_COUNTER_STATE_ACTIVE) + amd_pmu_enable_counter(hwc, idx); + } + return handled; +} void perf_counter_unthrottle(void) { @@ -782,9 +814,6 @@ void perf_counter_unthrottle(void) if (!x86_pmu_initialized()) return; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return; - cpuc = 
&__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) -- cgit v1.2.3 From 4b7bfd0d276da3a006d37e85d3cf900d7a14ae2a Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:22 +0200 Subject: perf_counter, x86: return raw count with x86_perf_counter_update() To check on AMD cpus if a counter overflows, the upper bit of the raw counter value must be checked. This value is already internally available in x86_perf_counter_update(). Now, the value is returned so that it can be used directly to check for overflows. [ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-26-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f4d59d4cf3f..a8a53abd706 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -132,7 +132,7 @@ static u64 amd_pmu_raw_event(u64 event) * Can only be executed on the CPU where the counter is active. * Returns the delta events processed. */ -static void +static u64 x86_perf_counter_update(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { @@ -165,6 +165,8 @@ again: atomic64_add(delta, &counter->count); atomic64_sub(delta, &hwc->period_left); + + return new_raw_count; } static atomic_t num_counters; @@ -785,8 +787,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) continue; counter = cpuc->counters[idx]; hwc = &counter->hw; - x86_perf_counter_update(counter, hwc, idx); - val = atomic64_read(&hwc->prev_count); + val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) continue; /* counter overflow */ -- cgit v1.2.3 From c619b8ffb1cec6a431687a35695dc6fd292a79e6 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:23 +0200 Subject: perf_counter, x86: introduce max_period variable In x86 pmus the allowed counter period to programm differs. This introduces a max_period value and allows the generic implementation for all models to check the max period. 
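The effect of the new field, shown as a self-contained clamp (the two limits are the ones the patch installs for Intel and AMD; everything else is illustrative):

    #include <stdio.h>
    #include <stdint.h>

    /* clamp a requested sampling period to what the PMU can program */
    static uint64_t clamp_period(int64_t requested, uint64_t max_period)
    {
        if (requested <= 0 || (uint64_t)requested > max_period)
            return max_period;
        return (uint64_t)requested;
    }

    int main(void)
    {
        uint64_t intel_max = (1ULL << 31) - 1;  /* PMCs not sane above 32 bit  */
        uint64_t amd_max   = (1ULL << 47) - 1;  /* keep the overflow bit clear */

        printf("%llu\n", (unsigned long long)clamp_period(1000000, intel_max));
        printf("%llu\n", (unsigned long long)clamp_period(-1, amd_max));
        printf("%llu\n", (unsigned long long)clamp_period(1LL << 40, intel_max));
        return 0;
    }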
[ Impact: generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-27-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a8a53abd706..4b8715b34f8 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -54,6 +54,7 @@ struct x86_pmu { int num_counters_fixed; int counter_bits; u64 counter_mask; + u64 max_period; }; static struct x86_pmu x86_pmu __read_mostly; @@ -279,14 +280,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; hwc->irq_period = hw_event->irq_period; - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic counter period: - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) - hwc->irq_period = 0x7FFFFFFF; + if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) + hwc->irq_period = x86_pmu.max_period; atomic64_set(&hwc->period_left, hwc->irq_period); @@ -910,6 +905,12 @@ static struct x86_pmu intel_pmu = { .event_map = intel_pmu_event_map, .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic counter period: + */ + .max_period = (1ULL << 31) - 1, }; static struct x86_pmu amd_pmu = { @@ -927,6 +928,8 @@ static struct x86_pmu amd_pmu = { .num_counters = 4, .counter_bits = 48, .counter_mask = (1ULL << 48) - 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, }; static int intel_pmu_init(void) @@ -999,6 +1002,7 @@ void __init init_hw_perf_counters(void) perf_max_counters = x86_pmu.num_counters; pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; -- cgit v1.2.3 From ef7b3e09ffdcd5200aea9523f6b56d331d1c4fc0 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:24 +0200 Subject: perf_counter, x86: remove vendor check in fixed_mode_idx() The function fixed_mode_idx() is used generically. Now it checks the num_counters_fixed value instead of the vendor to decide if fixed counters are present. 
[ Impact: generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-28-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 4b8715b34f8..d1c8036dcbd 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -542,7 +542,7 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { unsigned int event; - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + if (!x86_pmu.num_counters_fixed) return -1; if (unlikely(hwc->nmi)) -- cgit v1.2.3 From 19d84dab55a383d75c885b5c1a618f5ead96f2f6 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:25 +0200 Subject: perf_counter, x86: remove unused function argument in intel_pmu_get_status() The mask argument is unused and thus can be removed. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-29-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d1c8036dcbd..856b0b85219 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -393,7 +393,7 @@ void hw_perf_restore(u64 ctrl) */ EXPORT_SYMBOL_GPL(hw_perf_restore); -static inline u64 intel_pmu_get_status(u64 mask) +static inline u64 intel_pmu_get_status(void) { u64 status; @@ -728,7 +728,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) cpuc->throttle_ctrl = intel_pmu_save_disable_all(); - status = intel_pmu_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(); if (!status) goto out; @@ -753,7 +753,7 @@ again: /* * Repeat if there is more work to be done: */ - status = intel_pmu_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(); if (status) goto again; out: -- cgit v1.2.3 From 98144511427c192e4249ff66a3f9debc55c59411 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2009 14:52:50 +0200 Subject: perf_counter: add/update copyrights Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 856b0b85219..47e563bfd4c 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1,10 +1,11 @@ /* * Performance counter x86 architecture code * - * Copyright(C) 2008 Thomas Gleixner - * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar - * Copyright(C) 2009 Jaswinder Singh Rajput - * Copyright(C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * * For licencing details see kernel-base/COPYING */ -- cgit v1.2.3 From 43f6201a22dbf1c5abe1cab96b49bd56fa9df8f4 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 16:55:56 +0200 Subject: perf_counter, x86: rename bitmasks to 
->used_mask and ->active_mask Standardize on explicitly mentioning '_mask' in fields that are not plain flags but masks. This avoids typos like: if (cpuc->used) (which could easily slip through review unnoticed), while if a typo looks like this: if (cpuc->used_mask) it might get noticed during review. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1241016956-24648-1-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 47e563bfd4c..fc06f4d3264 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -28,8 +28,8 @@ static u64 perf_counter_mask __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; - unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long active[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; u64 throttle_ctrl; int enabled; @@ -332,7 +332,7 @@ static u64 amd_pmu_save_disable_all(void) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active)) + if (!test_bit(idx, cpuc->active_mask)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) @@ -373,7 +373,7 @@ static void amd_pmu_restore_all(u64 ctrl) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active)) + if (!test_bit(idx, cpuc->active_mask)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) @@ -576,7 +576,7 @@ static int x86_pmu_enable(struct perf_counter *counter) * Try to get the fixed counter, if that is already taken * then try to get a generic counter: */ - if (test_and_set_bit(idx, cpuc->used)) + if (test_and_set_bit(idx, cpuc->used_mask)) goto try_generic; hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; @@ -590,14 +590,14 @@ static int x86_pmu_enable(struct perf_counter *counter) } else { idx = hwc->idx; /* Try to get the previous generic counter again */ - if (test_and_set_bit(idx, cpuc->used)) { + if (test_and_set_bit(idx, cpuc->used_mask)) { try_generic: - idx = find_first_zero_bit(cpuc->used, + idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_counters); if (idx == x86_pmu.num_counters) return -EAGAIN; - set_bit(idx, cpuc->used); + set_bit(idx, cpuc->used_mask); hwc->idx = idx; } hwc->config_base = x86_pmu.eventsel; @@ -609,7 +609,7 @@ try_generic: x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; - set_bit(idx, cpuc->active); + set_bit(idx, cpuc->active_mask); x86_perf_counter_set_period(counter, hwc, idx); x86_pmu.enable(hwc, idx); @@ -643,7 +643,7 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); } - pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); + pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); for (idx = 0; idx < x86_pmu.num_counters; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); @@ -677,7 +677,7 @@ static void x86_pmu_disable(struct perf_counter *counter) * Must be done before we disable, otherwise the nmi handler * could reenable again: */ - clear_bit(idx, cpuc->active); + clear_bit(idx, cpuc->active_mask); 
x86_pmu.disable(hwc, idx); /* @@ -692,7 +692,7 @@ static void x86_pmu_disable(struct perf_counter *counter) */ x86_perf_counter_update(counter, hwc, idx); cpuc->counters[idx] = NULL; - clear_bit(idx, cpuc->used); + clear_bit(idx, cpuc->used_mask); } /* @@ -741,7 +741,7 @@ again: struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); - if (!test_bit(bit, cpuc->active)) + if (!test_bit(bit, cpuc->active_mask)) continue; intel_pmu_save_and_restart(counter); @@ -779,7 +779,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) ++cpuc->interrupts; for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active)) + if (!test_bit(idx, cpuc->active_mask)) continue; counter = cpuc->counters[idx]; hwc = &counter->hw; -- cgit v1.2.3 From 63a809a2dc53b91268dd915bbcbd425063893676 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 May 2009 12:23:17 +0200 Subject: perf_counter: fix nmi-watchdog interaction When we don't have any perf-counters active, don't act like we know what the NMI is for. [ Impact: fix hard hang with nmi_watchdog=2 ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090501102533.109867793@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fc06f4d3264..d4c0cc9d326 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -871,6 +871,9 @@ perf_counter_nmi_handler(struct notifier_block *self, struct pt_regs *regs; int ret; + if (!atomic_read(&num_counters)) + return NOTIFY_DONE; + switch (cmd) { case DIE_NMI: case DIE_NMI_IPI: -- cgit v1.2.3 From ba77813a2a22d631fe5bc0bf1ec0d11350544b70 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 May 2009 18:47:44 +0200 Subject: perf_counter: x86: fixup nmi_watchdog vs perf_counter boo-boo Invert the atomic_inc_not_zero() test so that we will indeed detect the first activation. Also rename the global num_counters, since its easy to confuse with x86_pmu.num_counters. 
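For readers puzzling over the inverted test, the intended first-user pattern is roughly this single-threaded sketch (C11 atomics in user space; the kernel additionally holds pmc_reserve_mutex around the slow path): the fast path only succeeds when somebody already holds a reference, and the very first user falls through to reserve the hardware. The original code had the branch the other way around, so the first activation never reserved anything.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int active_counters;

    /* stand-in for reserving the PMC MSRs; always succeeds in this sketch */
    static bool reserve_hardware(void) { return true; }

    /* returns true if the caller now holds a reference */
    static bool get_reference(void)
    {
        int old = atomic_load(&active_counters);

        while (old != 0)        /* atomic_inc_not_zero(): fast path */
            if (atomic_compare_exchange_weak(&active_counters, &old, old + 1))
                return true;

        /* slow path: first user, reserve the hardware before counting it */
        if (!reserve_hardware())
            return false;
        atomic_fetch_add(&active_counters, 1);
        return true;
    }

    int main(void)
    {
        printf("first user:  %d (reserved hardware)\n", get_reference());
        printf("second user: %d (fast path)\n", get_reference());
        printf("references:  %d\n", atomic_load(&active_counters));
        return 0;
    }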
[ Impact: fix non-working perfcounters on AMD CPUs, cleanup ] Signed-off-by: Peter Zijlstra LKML-Reference: <1241455664.7620.4938.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d4c0cc9d326..196b58f0444 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -171,7 +171,7 @@ again: return new_raw_count; } -static atomic_t num_counters; +static atomic_t active_counters; static DEFINE_MUTEX(pmc_reserve_mutex); static bool reserve_pmc_hardware(void) @@ -224,7 +224,7 @@ static void release_pmc_hardware(void) static void hw_perf_counter_destroy(struct perf_counter *counter) { - if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) { + if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { release_pmc_hardware(); mutex_unlock(&pmc_reserve_mutex); } @@ -248,12 +248,12 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -ENODEV; err = 0; - if (atomic_inc_not_zero(&num_counters)) { + if (!atomic_inc_not_zero(&active_counters)) { mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware()) + if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) err = -EBUSY; else - atomic_inc(&num_counters); + atomic_inc(&active_counters); mutex_unlock(&pmc_reserve_mutex); } if (err) @@ -280,7 +280,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (capable(CAP_SYS_ADMIN) && hw_event->nmi) hwc->nmi = 1; - hwc->irq_period = hw_event->irq_period; + hwc->irq_period = hw_event->irq_period; if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) hwc->irq_period = x86_pmu.max_period; @@ -871,7 +871,7 @@ perf_counter_nmi_handler(struct notifier_block *self, struct pt_regs *regs; int ret; - if (!atomic_read(&num_counters)) + if (!atomic_read(&active_counters)) return NOTIFY_DONE; switch (cmd) { -- cgit v1.2.3 From 066d7dea32c9bffe6decc0abe465627656cdd84e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2009 19:04:09 +0200 Subject: perf_counter: fix fixed-purpose counter support on v2 Intel-PERFMON Fixed-purpose counters stopped working in a simple 'perf stat ls' run: cache references cache misses Due to: ef7b3e0: perf_counter, x86: remove vendor check in fixed_mode_idx() Which made x86_pmu.num_counters_fixed matter: if it's nonzero, the fixed-purpose counters are utilized. But on v2 perfmon this field is not set (despite there being fixed-purpose PMCs). So add a quirk to set the number of fixed-purpose counters to at least three. 
[ Impact: add quirk for three fixed-purpose counters on certain Intel CPUs ] Cc: Robert Richter Cc: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <1241002046-8832-28-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 196b58f0444..a6878b0798e 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -962,7 +962,13 @@ static int intel_pmu_init(void) x86_pmu = intel_pmu; x86_pmu.version = version; x86_pmu.num_counters = eax.split.num_counters; - x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; + + /* + * Quirk: v2 perfmon does not report fixed-purpose counters, so + * assume at least 3 counters: + */ + x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); + x86_pmu.counter_bits = eax.split.bit_width; x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; -- cgit v1.2.3 From 8823392360dc4992f87bf4c623834d315f297493 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sun, 10 May 2009 10:53:05 +0200 Subject: perf_counter, x86: clean up throttling printk s/PERFMON/perfcounters for perfcounter interrupt throttling warning. 'perfmon' is the CPU feature name that is Intel-only, while we do throttling in a generic way. [ Impact: cleanup ] Signed-off-by: Mike Galbraith Cc: Robert Richter Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a6878b0798e..da27419923a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -814,7 +814,7 @@ void perf_counter_unthrottle(void) cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) - printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n"); + printk(KERN_WARNING "perfcounters: max interrupts exceeded!\n"); hw_perf_restore(cpuc->throttle_ctrl); } cpuc->interrupts = 0; -- cgit v1.2.3 From 5bb9efe33ea4001a17ab98186a40a134a3061d67 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 08:12:51 +0200 Subject: perf_counter: fix print debug irq disable inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. bash/15802 [HC0[0]:SC0[0]:HE1:SE1] takes: (sysrq_key_table_lock){?.....}, Don't unconditionally enable interrupts in the perf_counter_print_debug() path. 
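The rule the fix applies, in a self-contained sketch (a bool stands in for the CPU interrupt flag; the kernel uses local_irq_save()/local_irq_restore()): a debug helper that may run with interrupts already disabled must restore the previous state instead of unconditionally enabling.

    #include <stdbool.h>
    #include <stdio.h>

    static bool irqs_on = true;         /* stand-in for the interrupt flag */

    static void print_debug(void)
    {
        bool flags = irqs_on;           /* local_irq_save(flags)    */
        irqs_on = false;

        printf("dumping counter state with irqs off\n");

        irqs_on = flags;                /* local_irq_restore(flags) */
    }

    int main(void)
    {
        irqs_on = false;                /* caller already has irqs off ...    */
        print_debug();                  /* ... and the helper must keep it so */
        printf("irqs afterwards: %s\n", irqs_on ? "on (bug)" : "still off (ok)");
        return 0;
    }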
[ Impact: fix potential deadlock pointed out by lockdep ] LKML-Reference: Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- arch/x86/kernel/cpu/perf_counter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index da27419923a..f7772ff7936 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -621,12 +621,13 @@ void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; struct cpu_hw_counters *cpuc; + unsigned long flags; int cpu, idx; if (!x86_pmu.num_counters) return; - local_irq_disable(); + local_irq_save(flags); cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -664,7 +665,7 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); } - local_irq_enable(); + local_irq_restore(flags); } static void x86_pmu_disable(struct perf_counter *counter) -- cgit v1.2.3 From ec3232bdf8518bea8410f0027f870b24d3aa8753 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 09:45:19 +0200 Subject: perf_counter: x86: More accurate counter update Take the counter width into account instead of assuming 32 bits. In particular Nehalem has 44 bit wide counters, and all arithmetics should happen on a 44-bit signed integer basis. [ Impact: fix rare event imprecision, warning message on Nehalem ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f7772ff7936..3a92a2b2a80 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -138,7 +138,9 @@ static u64 x86_perf_counter_update(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { - u64 prev_raw_count, new_raw_count, delta; + int shift = 64 - x86_pmu.counter_bits; + u64 prev_raw_count, new_raw_count; + s64 delta; /* * Careful: an NMI might modify the previous counter value. @@ -161,9 +163,10 @@ again: * (counter-)time and add that to the generic counter. * * Careful, not all hw sign-extends above the physical width - * of the count, so we do that by clipping the delta to 32 bits: + * of the count. */ - delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; atomic64_add(delta, &counter->count); atomic64_sub(delta, &hwc->period_left); -- cgit v1.2.3 From f5a5a2f6e69e88647ae12da39f0ff3a510bcf0a6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 13 May 2009 12:54:01 +0200 Subject: perf_counter: x86: Fix throttling If counters are disabled globally when a perfcounter IRQ/NMI hits, and if we throttle in that case, we'll promote the '0' value to the next lapic IRQ and disable all perfcounters at that point, permanently ... Fix it. 
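A toy model of the control flow after the fix (not kernel code; the constant 0 for 'globally disabled' and the throttle limit are illustrative): the saved control word is only written back when it was non-zero to begin with, so a disabled PMU can no longer be 'restored' into a permanently disabled state.

    #include <stdio.h>
    #include <stdint.h>

    #define MAX_INTERRUPTS 4

    static uint64_t global_ctrl;        /* 0 == all counters globally disabled */
    static int interrupts;

    static void pmu_irq(void)
    {
        uint64_t saved = global_ctrl;   /* save + disable on entry */
        global_ctrl = 0;

        /* ... service overflowed counters here ... */

        /* re-enable only if we were enabled on entry and not throttled */
        if (saved && ++interrupts < MAX_INTERRUPTS)
            global_ctrl = saved;
    }

    int main(void)
    {
        pmu_irq();                      /* irq while globally disabled */
        printf("ctrl: %#llx\n", (unsigned long long)global_ctrl);

        global_ctrl = 0x3;              /* two counters enabled */
        pmu_irq();
        printf("ctrl: %#llx\n", (unsigned long long)global_ctrl);
        return 0;
    }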
[ Impact: fix hung perfcounters under load ] Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3a92a2b2a80..88ae8cebf3c 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -765,8 +765,13 @@ out: /* * Restore - do not reenable when global enable is off or throttled: */ - if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) - intel_pmu_restore_all(cpuc->throttle_ctrl); + if (cpuc->throttle_ctrl) { + if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) { + intel_pmu_restore_all(cpuc->throttle_ctrl); + } else { + pr_info("CPU#%d: perfcounters: max interrupt rate exceeded! Throttle on.\n", smp_processor_id()); + } + } return ret; } @@ -817,11 +822,16 @@ void perf_counter_unthrottle(void) cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - if (printk_ratelimit()) - printk(KERN_WARNING "perfcounters: max interrupts exceeded!\n"); + pr_info("CPU#%d: perfcounters: throttle off.\n", smp_processor_id()); + + /* + * Clear them before re-enabling irqs/NMIs again: + */ + cpuc->interrupts = 0; hw_perf_restore(cpuc->throttle_ctrl); + } else { + cpuc->interrupts = 0; } - cpuc->interrupts = 0; } void smp_perf_counter_interrupt(struct pt_regs *regs) -- cgit v1.2.3 From a026dfecc035f213c1cfa0bf6407ce3155f6a9df Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 10:02:57 +0200 Subject: perf_counter: x86: Allow unpriviliged use of NMIs Apply sysctl_perf_counter_priv to NMIs. Also, fail the counter creation instead of silently down-grading to regular interrupts. [ Impact: allow wider perf-counter usage ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 88ae8cebf3c..c19e927b697 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -280,8 +280,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * If privileged enough, allow NMI events: */ hwc->nmi = 0; - if (capable(CAP_SYS_ADMIN) && hw_event->nmi) + if (hw_event->nmi) { + if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) + return -EACCES; hwc->nmi = 1; + } hwc->irq_period = hw_event->irq_period; if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) -- cgit v1.2.3 From 962bf7a66edca4d36a730a38ff8410a67f560e40 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 13:21:36 +0200 Subject: perf_counter: x86: Fix up the amd NMI/INT throttle perf_counter_unthrottle() restores throttle_ctrl, buts its never set. Also, we fail to disable all counters when throttling. 
[ Impact: fix rare stuck perf-counters when they are throttled ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c19e927b697..7601c014f8f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -334,6 +334,8 @@ static u64 amd_pmu_save_disable_all(void) * right thing. */ barrier(); + if (!enabled) + goto out; for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; @@ -347,6 +349,7 @@ static u64 amd_pmu_save_disable_all(void) wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } +out: return enabled; } @@ -787,32 +790,43 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) int handled = 0; struct perf_counter *counter; struct hw_perf_counter *hwc; - int idx; + int idx, throttle = 0; + + cpuc->throttle_ctrl = cpuc->enabled; + cpuc->enabled = 0; + barrier(); + + if (cpuc->throttle_ctrl) { + if (++cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) + throttle = 1; + } - ++cpuc->interrupts; for (idx = 0; idx < x86_pmu.num_counters; idx++) { + int disable = 0; + if (!test_bit(idx, cpuc->active_mask)) continue; + counter = cpuc->counters[idx]; hwc = &counter->hw; val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) - continue; + goto next; + /* counter overflow */ x86_perf_counter_set_period(counter, hwc, idx); handled = 1; inc_irq_stat(apic_perf_irqs); - if (perf_counter_overflow(counter, nmi, regs, 0)) - amd_pmu_disable_counter(hwc, idx); - else if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) - /* - * do not reenable when throttled, but reload - * the register - */ + disable = perf_counter_overflow(counter, nmi, regs, 0); + +next: + if (disable || throttle) amd_pmu_disable_counter(hwc, idx); - else if (counter->state == PERF_COUNTER_STATE_ACTIVE) - amd_pmu_enable_counter(hwc, idx); } + + if (cpuc->throttle_ctrl && !throttle) + cpuc->enabled = 1; + return handled; } -- cgit v1.2.3 From 9e35ad388bea89f7d6f375af4c0ae98803688666 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 16:21:38 +0200 Subject: perf_counter: Rework the perf counter disable/enable The current disable/enable mechanism is: token = hw_perf_save_disable(); ... /* do bits */ ... hw_perf_restore(token); This works well, provided that the use nests properly. Except we don't. x86 NMI/INT throttling has non-nested use of this, breaking things. Therefore provide a reference counter disable/enable interface, where the first disable disables the hardware, and the last enable enables the hardware again. 
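The nesting behaviour of the new interface, as a small user-space sketch (single threaded; the names mirror the new perf_disable()/perf_enable() pair, the bodies are purely illustrative): only the first disable and the matching last enable actually touch the hardware.

    #include <stdio.h>

    static int disable_depth;

    static void hw_disable(void) { printf("hardware disabled\n"); }
    static void hw_enable(void)  { printf("hardware enabled\n");  }

    static void perf_disable(void)
    {
        if (disable_depth++ == 0)       /* first disable really disables */
            hw_disable();
    }

    static void perf_enable(void)
    {
        if (--disable_depth == 0)       /* last enable really enables */
            hw_enable();
    }

    int main(void)
    {
        perf_disable();                 /* outer section                */
        perf_disable();                 /* nested use, e.g. irq handler */
        perf_enable();                  /* hardware still left disabled */
        perf_enable();                  /* hardware re-enabled here     */
        return 0;
    }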
[ Impact: refactor, simplify the PMU disable/enable logic ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 113 ++++++++++++++----------------------- 1 file changed, 42 insertions(+), 71 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7601c014f8f..313638cecbb 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -31,7 +31,6 @@ struct cpu_hw_counters { unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; - u64 throttle_ctrl; int enabled; }; @@ -42,8 +41,8 @@ struct x86_pmu { const char *name; int version; int (*handle_irq)(struct pt_regs *, int); - u64 (*save_disable_all)(void); - void (*restore_all)(u64); + void (*disable_all)(void); + void (*enable_all)(void); void (*enable)(struct hw_perf_counter *, int); void (*disable)(struct hw_perf_counter *, int); unsigned eventsel; @@ -56,6 +55,7 @@ struct x86_pmu { int counter_bits; u64 counter_mask; u64 max_period; + u64 intel_ctrl; }; static struct x86_pmu x86_pmu __read_mostly; @@ -311,22 +311,19 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return 0; } -static u64 intel_pmu_save_disable_all(void) +static void intel_pmu_disable_all(void) { - u64 ctrl; - - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - return ctrl; } -static u64 amd_pmu_save_disable_all(void) +static void amd_pmu_disable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - int enabled, idx; + int idx; + + if (!cpuc->enabled) + return; - enabled = cpuc->enabled; cpuc->enabled = 0; /* * ensure we write the disable before we start disabling the @@ -334,8 +331,6 @@ static u64 amd_pmu_save_disable_all(void) * right thing. 
*/ barrier(); - if (!enabled) - goto out; for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; @@ -348,37 +343,31 @@ static u64 amd_pmu_save_disable_all(void) val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } - -out: - return enabled; } -u64 hw_perf_save_disable(void) +void hw_perf_disable(void) { if (!x86_pmu_initialized()) - return 0; - return x86_pmu.save_disable_all(); + return; + return x86_pmu.disable_all(); } -/* - * Exported because of ACPI idle - */ -EXPORT_SYMBOL_GPL(hw_perf_save_disable); -static void intel_pmu_restore_all(u64 ctrl) +static void intel_pmu_enable_all(void) { - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); } -static void amd_pmu_restore_all(u64 ctrl) +static void amd_pmu_enable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int idx; - cpuc->enabled = ctrl; - barrier(); - if (!ctrl) + if (cpuc->enabled) return; + cpuc->enabled = 1; + barrier(); + for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; @@ -392,16 +381,12 @@ static void amd_pmu_restore_all(u64 ctrl) } } -void hw_perf_restore(u64 ctrl) +void hw_perf_enable(void) { if (!x86_pmu_initialized()) return; - x86_pmu.restore_all(ctrl); + x86_pmu.enable_all(); } -/* - * Exported because of ACPI idle - */ -EXPORT_SYMBOL_GPL(hw_perf_restore); static inline u64 intel_pmu_get_status(void) { @@ -735,15 +720,14 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) int bit, cpu = smp_processor_id(); u64 ack, status; struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); - int ret = 0; - - cpuc->throttle_ctrl = intel_pmu_save_disable_all(); + perf_disable(); status = intel_pmu_get_status(); - if (!status) - goto out; + if (!status) { + perf_enable(); + return 0; + } - ret = 1; again: inc_irq_stat(apic_perf_irqs); ack = status; @@ -767,19 +751,11 @@ again: status = intel_pmu_get_status(); if (status) goto again; -out: - /* - * Restore - do not reenable when global enable is off or throttled: - */ - if (cpuc->throttle_ctrl) { - if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) { - intel_pmu_restore_all(cpuc->throttle_ctrl); - } else { - pr_info("CPU#%d: perfcounters: max interrupt rate exceeded! 
Throttle on.\n", smp_processor_id()); - } - } - return ret; + if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS) + perf_enable(); + + return 1; } static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) @@ -792,13 +768,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) struct hw_perf_counter *hwc; int idx, throttle = 0; - cpuc->throttle_ctrl = cpuc->enabled; - cpuc->enabled = 0; - barrier(); - - if (cpuc->throttle_ctrl) { - if (++cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) - throttle = 1; + if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { + throttle = 1; + __perf_disable(); + cpuc->enabled = 0; + barrier(); } for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -824,9 +798,6 @@ next: amd_pmu_disable_counter(hwc, idx); } - if (cpuc->throttle_ctrl && !throttle) - cpuc->enabled = 1; - return handled; } @@ -839,13 +810,11 @@ void perf_counter_unthrottle(void) cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - pr_info("CPU#%d: perfcounters: throttle off.\n", smp_processor_id()); - /* * Clear them before re-enabling irqs/NMIs again: */ cpuc->interrupts = 0; - hw_perf_restore(cpuc->throttle_ctrl); + perf_enable(); } else { cpuc->interrupts = 0; } @@ -931,8 +900,8 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { static struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, - .save_disable_all = intel_pmu_save_disable_all, - .restore_all = intel_pmu_restore_all, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, .enable = intel_pmu_enable_counter, .disable = intel_pmu_disable_counter, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, @@ -951,8 +920,8 @@ static struct x86_pmu intel_pmu = { static struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, - .save_disable_all = amd_pmu_save_disable_all, - .restore_all = amd_pmu_restore_all, + .disable_all = amd_pmu_disable_all, + .enable_all = amd_pmu_enable_all, .enable = amd_pmu_enable_counter, .disable = amd_pmu_disable_counter, .eventsel = MSR_K7_EVNTSEL0, @@ -1003,6 +972,8 @@ static int intel_pmu_init(void) x86_pmu.counter_bits = eax.split.bit_width; x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + return 0; } -- cgit v1.2.3 From a4016a79fcbd139e7378944c0d86a39fdbc70ecc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 May 2009 14:52:17 +0200 Subject: perf_counter: x86: Robustify interrupt handling Two consecutive NMIs could daze and confuse the machine when the first would handle the overflow of both counters. 
[ Impact: fix false-positive syslog messages under multi-session profiling ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 313638cecbb..1dcf67057f1 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -783,6 +783,10 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) counter = cpuc->counters[idx]; hwc = &counter->hw; + + if (counter->hw_event.nmi != nmi) + goto next; + val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) goto next; @@ -869,7 +873,6 @@ perf_counter_nmi_handler(struct notifier_block *self, { struct die_args *args = __args; struct pt_regs *regs; - int ret; if (!atomic_read(&active_counters)) return NOTIFY_DONE; @@ -886,9 +889,16 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - ret = x86_pmu.handle_irq(regs, 1); + /* + * Can't rely on the handled return value to say it was our NMI, two + * counters could trigger 'simultaneously' raising two back-to-back NMIs. + * + * If the first NMI handles both, the latter will be empty and daze + * the CPU. + */ + x86_pmu.handle_irq(regs, 1); - return ret ? NOTIFY_STOP : NOTIFY_OK; + return NOTIFY_STOP; } static __read_mostly struct notifier_block perf_counter_nmi_notifier = { -- cgit v1.2.3 From 1c80f4b598d9b075a2a0be694e28be93a6702bcc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 15 May 2009 08:25:22 +0200 Subject: perf_counter: x86: Disallow interval of 1 On certain CPUs i have observed a stuck PMU if interval was set to 1 and NMIs were used. The PMU had PMC0 set in MSR_CORE_PERF_GLOBAL_STATUS, but it was not possible to ack it via MSR_CORE_PERF_GLOBAL_OVF_CTRL, and the NMI loop got stuck infinitely. [ Impact: fix rare hangs during high perfcounter load ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1dcf67057f1..46a82d1e4cb 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -473,6 +473,11 @@ x86_perf_counter_set_period(struct perf_counter *counter, left += period; atomic64_set(&hwc->period_left, left); } + /* + * Quirk: certain CPUs dont like it if just 1 event is left: + */ + if (unlikely(left < 2)) + left = 2; per_cpu(prev_left[idx], smp_processor_id()) = left; -- cgit v1.2.3 From 9029a5e3801f1cc7cdaab80169d82427acf928d8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 15 May 2009 08:26:20 +0200 Subject: perf_counter: x86: Protect against infinite loops in intel_pmu_handle_irq() intel_pmu_handle_irq() can lock up in an infinite loop if the hardware does not allow the acking of irqs. Alas, this happened in testing so make this robust and emit a warning if it happens in the future. Also, clean up the IRQ handlers a bit. 
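A generic rendering of the defensive pattern (self-contained sketch; the limit of 100 matches the patch, everything else, including the stuck status value, is invented for illustration): bound the ack-and-retry loop so a status register that never clears cannot wedge the handler forever.

    #include <stdio.h>
    #include <stdint.h>

    /* stand-in for reading the overflow status; pretend a bit is stuck */
    static uint64_t read_status(void) { return 0x1; }

    static int handle_irq(void)
    {
        uint64_t status = read_status();
        int loops = 0;

        while (status) {
            if (++loops > 100) {
                fprintf(stderr, "irq loop stuck, giving up\n");
                return 1;       /* still claim the interrupt */
            }
            /* ... handle and ack the bits set in 'status' ... */
            status = read_status();
        }
        return 1;
    }

    int main(void)
    {
        return handle_irq() ? 0 : 1;
    }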
[ Impact: improve perfcounter irq/nmi handling robustness ] Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 46a82d1e4cb..5a7f718eb1e 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -722,9 +722,13 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) */ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) { - int bit, cpu = smp_processor_id(); + struct cpu_hw_counters *cpuc; + struct cpu_hw_counters; + int bit, cpu, loops; u64 ack, status; - struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); perf_disable(); status = intel_pmu_get_status(); @@ -733,7 +737,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) return 0; } + loops = 0; again: + if (++loops > 100) { + WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); + return 1; + } + inc_irq_stat(apic_perf_irqs); ack = status; for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { @@ -765,13 +775,14 @@ again: static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { - int cpu = smp_processor_id(); - struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); - u64 val; - int handled = 0; + int cpu, idx, throttle = 0, handled = 0; + struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; - int idx, throttle = 0; + u64 val; + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { throttle = 1; -- cgit v1.2.3 From 60db5e09c13109b13830cc9dcae688003fd39e79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:19:28 +0200 Subject: perf_counter: frequency based adaptive irq_period Instead of specifying the irq_period for a counter, provide a target interrupt frequency and dynamically adapt the irq_period to match this frequency. 
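A toy illustration of the idea only (the real adaptation lives in generic perf code and is more careful than this; the numbers and the helper are invented for the example): from an observed event rate and a requested number of interrupts per second, derive the next period and keep it within the PMU limit.

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t next_period(uint64_t events_last_sec, uint64_t target_hz,
                                uint64_t max_period)
    {
        uint64_t period = events_last_sec / (target_hz ? target_hz : 1);

        if (period == 0)
            period = 1;
        if (period > max_period)
            period = max_period;
        return period;
    }

    int main(void)
    {
        /* ~2 GHz worth of cycles, aiming for 1000 samples per second */
        printf("%llu\n", (unsigned long long)
               next_period(2000000000ULL, 1000, (1ULL << 31) - 1));
        return 0;
    }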
[ Impact: new perf-counter attribute/feature ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090515132018.646195868@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5a7f718eb1e..886dcf334bc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -286,11 +286,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; } - hwc->irq_period = hw_event->irq_period; - if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) - hwc->irq_period = x86_pmu.max_period; - - atomic64_set(&hwc->period_left, hwc->irq_period); + atomic64_set(&hwc->period_left, + min(x86_pmu.max_period, hwc->irq_period)); /* * Raw event type provide the config in the event structure @@ -458,7 +455,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->irq_period; + s64 period = min(x86_pmu.max_period, hwc->irq_period); int err; /* -- cgit v1.2.3 From d2517a49d55536b38c7a87e5289550cfedaa4dcc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 10:04:45 +0200 Subject: perf_counter, x86: fix zero irq_period counters The quirk to irq_period unearthed a robustness problem in the hw_counter initialization sequence: we left irq_period at 0, which was then quirked up to 2 ... which then generated a _lot_ of interrupts during 'perf stat' runs, slowed them down and skewed the counter results in general. Initialize irq_period to the maximum instead. [ Impact: fix perf stat results ] Cc: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 886dcf334bc..5bfd30ab392 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -286,6 +286,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; } + if (!hwc->irq_period) + hwc->irq_period = x86_pmu.max_period; + atomic64_set(&hwc->period_left, min(x86_pmu.max_period, hwc->irq_period)); -- cgit v1.2.3 From b68f1d2e7aa21029d73c7d453a8046e95d351740 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 19:37:25 +0200 Subject: perf_counter, x86: speed up the scheduling fast-path We have to set up the LVT entry only at counter init time, not at every switch-in time. There's friction between NMI and non-NMI use here - we'll probably remove the per-counter configurability of it - but until then, don't slow things down ...
[ Impact: micro-optimization ] Cc: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5bfd30ab392..c109819c2cb 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -285,6 +285,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -EACCES; hwc->nmi = 1; } + perf_counters_lapic_init(hwc->nmi); if (!hwc->irq_period) hwc->irq_period = x86_pmu.max_period; @@ -603,8 +604,6 @@ try_generic: hwc->counter_base = x86_pmu.perfctr; } - perf_counters_lapic_init(hwc->nmi); - x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; @@ -1054,7 +1053,7 @@ void __init init_hw_perf_counters(void) pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_lapic_init(0); + perf_counters_lapic_init(1); register_die_notifier(&perf_counter_nmi_notifier); } -- cgit v1.2.3 From 34adc8062227f41b04ade0ff3fbd1dbe3002669e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 May 2009 20:13:28 +0200 Subject: perf_counter: Fix context removal deadlock Disable the PMU globally before removing a counter from a context. This fixes the following lockup: [22081.741922] ------------[ cut here ]------------ [22081.746668] WARNING: at arch/x86/kernel/cpu/perf_counter.c:803 intel_pmu_handle_irq+0x9b/0x24e() [22081.755624] Hardware name: X8DTN [22081.758903] perfcounters: irq loop stuck! [22081.762985] Modules linked in: [22081.766136] Pid: 11082, comm: perf Not tainted 2.6.30-rc6-tip #226 [22081.772432] Call Trace: [22081.774940] [] ? intel_pmu_handle_irq+0x9b/0x24e [22081.781993] [] ? intel_pmu_handle_irq+0x9b/0x24e [22081.788368] [] ? warn_slowpath_common+0x77/0xa3 [22081.794649] [] ? warn_slowpath_fmt+0x40/0x45 [22081.800696] [] ? intel_pmu_handle_irq+0x9b/0x24e [22081.807080] [] ? perf_counter_nmi_handler+0x3f/0x4a [22081.813751] [] ? notifier_call_chain+0x58/0x86 [22081.819951] [] ? notify_die+0x2d/0x32 [22081.825392] [] ? do_nmi+0x8e/0x242 [22081.830538] [] ? nmi+0x1a/0x20 [22081.835342] [] ? selinux_file_free_security+0x0/0x1a [22081.842105] [] ? x86_pmu_disable_counter+0x15/0x41 [22081.848673] <> [] ? x86_pmu_disable+0x86/0x103 [22081.855512] [] ? __perf_counter_remove_from_context+0x0/0xfe [22081.862926] [] ? counter_sched_out+0x30/0xce [22081.868909] [] ? __perf_counter_remove_from_context+0x59/0xfe [22081.876382] [] ? smp_call_function_single+0x6c/0xe6 [22081.882955] [] ? perf_release+0x86/0x14c [22081.888600] [] ? __fput+0xe7/0x195 [22081.893718] [] ? filp_close+0x5b/0x62 [22081.899107] [] ? put_files_struct+0x64/0xc2 [22081.905031] [] ? do_exit+0x1e2/0x6ef [22081.910360] [] ? _spin_lock_irqsave+0x9/0xe [22081.916292] [] ? do_group_exit+0x67/0x93 [22081.921953] [] ? sys_exit_group+0x12/0x16 [22081.927759] [] ? system_call_fastpath+0x16/0x1b [22081.934076] ---[ end trace 3a3936ce3e1b4505 ]--- And could potentially also fix the lockup reported by Marcelo Tosatti. Also, print more debug info in case of a detected lockup. 
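( The fix proper is in the generic perf code; only the extra debug output shows up in the x86 diff below. Its assumed shape, sketched with helpers this series already uses: )

	static void remove_counter_safely(struct perf_counter *counter,
					  struct perf_counter_context *ctx)
	{
		perf_disable();			/* no counter NMIs beyond this point */
		list_del_counter(counter, ctx);	/* now safe to unlink */
		perf_enable();
	}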
[ Impact: fix lockup ] Reported-by: Marcelo Tosatti Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Thomas Gleixner LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c109819c2cb..6cc1660db8d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -740,6 +740,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) again: if (++loops > 100) { WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); + perf_counter_print_debug(); return 1; } -- cgit v1.2.3 From a63eaf34ae60bdb067a354cc8def2e8f4a01f5f4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 May 2009 14:17:31 +1000 Subject: perf_counter: Dynamically allocate tasks' perf_counter_context struct This replaces the struct perf_counter_context in the task_struct with a pointer to a dynamically allocated perf_counter_context struct. The main reason for doing this is to allow us to transfer a perf_counter_context from one task to another when we do lazy PMU switching in a later patch. This has a few side-benefits: the task_struct becomes a little smaller, we save some memory because only tasks that have perf_counters attached get a perf_counter_context allocated for them, and we can remove the inclusion of <linux/perf_counter.h> in sched.h, meaning that we don't end up recompiling nearly everything whenever perf_counter.h changes. The perf_counter_context structures are reference-counted and freed when the last reference is dropped. A context can have references from its task and the counters on its task. Counters can outlive the task, so it is possible that a context will be freed well after its task has exited. Contexts are allocated on fork if the parent had a context, or otherwise the first time that a per-task counter is created on a task. In the latter case, we set the context pointer in the task struct locklessly using an atomic compare-and-exchange operation in case we raced with some other task in creating a context for the subject task. This also removes the task pointer from the perf_counter struct. The task pointer was not used anywhere and would make it harder to move a context from one task to another. Anything that needed to know which task a counter was attached to was already using counter->ctx->task. The __perf_counter_init_context function moves up in perf_counter.c so that it can be called from find_get_context, and now initializes the refcount, but is otherwise unchanged. We were potentially calling list_del_counter twice: once from __perf_counter_exit_task when the task exits and once from __perf_counter_remove_from_context when the counter's fd gets closed. This adds a check in list_del_counter so it doesn't do anything if the counter has already been removed from the lists. Since perf_counter_task_sched_in doesn't do anything if the task doesn't have a context, and leaves cpuctx->task_ctx = NULL, this adds code to __perf_install_in_context to set cpuctx->task_ctx if necessary, i.e. in the case where the current task adds the first counter to itself and thus creates a context for itself. This also adds similar code to __perf_counter_enable to handle a similar situation which can arise when the counters have been disabled using prctl; that also leaves cpuctx->task_ctx = NULL.
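( The lockless installation described above, sketched. The perf_counter_ctxp field name and the error handling are assumptions; the cmpxchg() race handling is as described: )

	static struct perf_counter_context *
	find_or_install_ctx(struct task_struct *task)
	{
		struct perf_counter_context *ctx;

		ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
		if (!ctx)
			return NULL;
		__perf_counter_init_context(ctx, task);

		if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx) != NULL) {
			/* raced with another installer: drop ours, take theirs */
			kfree(ctx);
			ctx = task->perf_counter_ctxp;
		}
		return ctx;
	}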
[ Impact: refactor counter context management to prepare for new feature ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10075.781053.231153@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e9021a90802..b4f64402a82 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,6 +14,7 @@ * Mikael Pettersson : PM converted to driver model. */ +#include <linux/perf_counter.h> #include #include #include -- cgit v1.2.3 From ff99be573e02e9f7edc23b472c7f9a5ddba12795 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:03 +0200 Subject: perf_counter: x86: Expose INV and EDGE bits Expose the INV and EDGE bits of the PMU to raw configs. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.494709027@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6cc1660db8d..c14437faf5d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -87,11 +87,15 @@ static u64 intel_pmu_raw_event(u64 event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL +#define CORE_EVNTSEL_INV_MASK 0x00800000ULL #define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ CORE_EVNTSEL_UNIT_MASK | \ + CORE_EVNTSEL_EDGE_MASK | \ + CORE_EVNTSEL_INV_MASK | \ CORE_EVNTSEL_COUNTER_MASK) return event & CORE_EVNTSEL_MASK; @@ -119,11 +123,15 @@ static u64 amd_pmu_raw_event(u64 event) { #define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL +#define K7_EVNTSEL_INV_MASK 0x000800000ULL #define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL #define K7_EVNTSEL_MASK \ (K7_EVNTSEL_EVENT_MASK | \ K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_EDGE_MASK | \ + K7_EVNTSEL_INV_MASK | \ K7_EVNTSEL_COUNTER_MASK) return event & K7_EVNTSEL_MASK; -- cgit v1.2.3 From 48e22d56ecdeddd1ffb42a02fccba5c6ef42b133 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:04 +0200 Subject: perf_counter: x86: Remove interrupt throttle Remove the x86-specific interrupt throttle. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.616671838@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 -- arch/x86/kernel/cpu/perf_counter.c | 47 ++++---------------------------------- 2 files changed, 5 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b4f64402a82..89b63b5fad3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -763,8 +763,6 @@ static void local_apic_timer_interrupt(void) inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); - - perf_counter_unthrottle(); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c14437faf5d..8c8177f859f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -718,11 +718,6 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) intel_pmu_enable_counter(hwc, idx); } -/* - * Maximum interrupt frequency of 100KHz per CPU - */ -#define PERFMON_MAX_INTERRUPTS (100000/HZ) - /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -775,15 +770,14 @@ again: if (status) goto again; - if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS) - perf_enable(); + perf_enable(); return 1; } static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { - int cpu, idx, throttle = 0, handled = 0; + int cpu, idx, handled = 0; struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; @@ -792,16 +786,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { - throttle = 1; - __perf_disable(); - cpuc->enabled = 0; - barrier(); - } - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - int disable = 0; - if (!test_bit(idx, cpuc->active_mask)) continue; @@ -809,45 +794,23 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) counter = cpuc->counters[idx]; hwc = &counter->hw; if (counter->hw_event.nmi != nmi) - goto next; + continue; val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) - goto next; + continue; /* counter overflow */ x86_perf_counter_set_period(counter, hwc, idx); handled = 1; inc_irq_stat(apic_perf_irqs); - disable = perf_counter_overflow(counter, nmi, regs, 0); - -next: - if (disable || throttle) + if (perf_counter_overflow(counter, nmi, regs, 0)) amd_pmu_disable_counter(hwc, idx); } return handled; } -void perf_counter_unthrottle(void) -{ - struct cpu_hw_counters *cpuc; - - if (!x86_pmu_initialized()) - return; - - cpuc = &__get_cpu_var(cpu_hw_counters); - if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - /* - * Clear them before re-enabling irqs/NMIs again: - */ - cpuc->interrupts = 0; - perf_enable(); - } else { - cpuc->interrupts = 0; - } -} - void smp_perf_counter_interrupt(struct pt_regs *regs) { irq_enter(); -- cgit v1.2.3 From a78ac3258782f3e64cb40beb5990808e1febcc0c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:05 +0200 Subject: perf_counter: Generic per counter interrupt throttle Introduce a generic per-counter interrupt throttle. This uses the perf_counter_overflow() quick disable to throttle a specific counter when it's going too fast, when a pmu->unthrottle() method is provided that can undo the quick disable. Power needs to implement both the quick disable and the unthrottle method.
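( The two halves of the throttle, sketched as fragments. When exactly the generic code invokes ->unthrottle() is an assumption here - presumably the regular timer tick: )

	/* overflow path (NMI context): a nonzero return means "quick disable": */
	if (perf_counter_overflow(counter, 1, regs, 0))
		x86_pmu.disable(hwc, hwc->idx);		/* counter throttled */

	/* later, from the generic code: */
	counter->pmu->unthrottle(counter);		/* -> x86_pmu_unthrottle() */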
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.703093461@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8c8177f859f..c4b543d1a86 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -623,6 +623,18 @@ try_generic: return 0; } +static void x86_pmu_unthrottle(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + + if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || + cpuc->counters[hwc->idx] != counter)) + return; + + x86_pmu.enable(hwc, hwc->idx); +} + void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; @@ -1038,6 +1050,7 @@ static const struct pmu pmu = { .enable = x86_pmu_enable, .disable = x86_pmu_disable, .read = x86_pmu_read, + .unthrottle = x86_pmu_unthrottle, }; const struct pmu *hw_perf_counter_init(struct perf_counter *counter) -- cgit v1.2.3 From 53b441a565bf4036ab49c8ea04c5ad06ace7dd6b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2009 21:41:28 +0200 Subject: Revert "perf_counter, x86: speed up the scheduling fast-path" This reverts commit b68f1d2e7aa21029d73c7d453a8046e95d351740. It is causing problems (stuck/stuttering profiling) when mixed NMI and non-NMI counters are used. Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.703093461@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c4b543d1a86..189bf9d7cda 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -293,7 +293,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -EACCES; hwc->nmi = 1; } - perf_counters_lapic_init(hwc->nmi); if (!hwc->irq_period) hwc->irq_period = x86_pmu.max_period; @@ -612,6 +611,8 @@ try_generic: hwc->counter_base = x86_pmu.perfctr; } + perf_counters_lapic_init(hwc->nmi); + x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; @@ -1037,7 +1038,7 @@ void __init init_hw_perf_counters(void) pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_lapic_init(1); + perf_counters_lapic_init(0); register_die_notifier(&perf_counter_nmi_notifier); } -- cgit v1.2.3 From 79202ba9ff8cf570a75596f42e011167734d1c4b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 May 2009 08:10:00 +0200 Subject: perf_counter, x86: Fix APIC NMI programming My Nehalem box locks up in certain situations (with an always-asserted NMI causing a lockup) if the PMU LVT entry is programmed between NMI and IRQ mode at a high frequency. Standardize exclusively on NMIs instead.
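( In code terms: the LVT entry is programmed once, to NMI delivery mode, and never toggled back to a fixed vector. This is the one-liner a later patch in this series settles on: )

	/* always program the PMU LVT entry for NMI delivery: */
	apic_write(APIC_LVTPC, APIC_DM_NMI);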
[ Impact: fix lockup ] Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 189bf9d7cda..ece3813c7a3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -285,14 +285,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; /* - * If privileged enough, allow NMI events: + * Use NMI events all the time: */ - hwc->nmi = 0; - if (hw_event->nmi) { - if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) - return -EACCES; - hwc->nmi = 1; - } + hwc->nmi = 1; + hw_event->nmi = 1; if (!hwc->irq_period) hwc->irq_period = x86_pmu.max_period; @@ -553,9 +549,6 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) if (!x86_pmu.num_counters_fixed) return -1; - if (unlikely(hwc->nmi)) - return -1; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS))) @@ -806,9 +799,6 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) counter = cpuc->counters[idx]; hwc = &counter->hw; - if (counter->hw_event.nmi != nmi) - continue; - val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) continue; -- cgit v1.2.3 From aaba98018b8295dfa2119345d17f833d74448cd0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 May 2009 08:10:00 +0200 Subject: perf_counter, x86: Make NMI lockups more robust We have a debug check that detects stuck NMIs and returns with the PMU disabled in the global ctrl MSR - but I managed to trigger a situation where this was not enough to deassert the NMI. So clear/reset the full PMU and keep the disable count balanced when exiting from here. This way the box produces a debug warning but stays up and is more debuggable.
[ Impact: in case of PMU related bugs, recover more gracefully ] Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ece3813c7a3..2eeaa99add1 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -724,6 +724,30 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) intel_pmu_enable_counter(hwc, idx); } +static void intel_pmu_reset(void) +{ + unsigned long flags; + int idx; + + if (!x86_pmu.num_counters) + return; + + local_irq_save(flags); + + printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); + checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + } + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } + + local_irq_restore(flags); +} + + /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -750,6 +774,8 @@ again: if (++loops > 100) { WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); perf_counter_print_debug(); + intel_pmu_reset(); + perf_enable(); return 1; } -- cgit v1.2.3 From c323d95fa4dbe0b6bf6d59e24a0b7db067dd08a7 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Fri, 29 May 2009 13:28:35 +0800 Subject: perf_counter/x86: Always use NMI for performance-monitoring interrupt Always use NMI for performance-monitoring interrupt as there could be racy situations if we switch between irq and nmi mode frequently. 
Signed-off-by: Yong Wang LKML-Reference: <20090529052835.GA13657@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 4 ++-- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 19 +++++-------------- 3 files changed, 8 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index d08dd52cb8f..876ed97147b 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -91,10 +91,10 @@ extern void set_perf_counter_pending(void); #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); -extern void perf_counters_lapic_init(int nmi); +extern void perf_counters_lapic_init(void); #else static inline void init_hw_perf_counters(void) { } -static inline void perf_counters_lapic_init(int nmi) { } +static inline void perf_counters_lapic_init(void) { } #endif #endif /* _ASM_X86_PERF_COUNTER_H */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 89b63b5fad3..60df2efd7c8 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1135,7 +1135,7 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - perf_counters_lapic_init(0); + perf_counters_lapic_init(); preempt_disable(); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2eeaa99add1..316b0c995f3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -604,7 +604,7 @@ try_generic: hwc->counter_base = x86_pmu.perfctr; } - perf_counters_lapic_init(hwc->nmi); + perf_counters_lapic_init(); x86_pmu.disable(hwc, idx); @@ -863,24 +863,15 @@ void set_perf_counter_pending(void) apic->send_IPI_self(LOCAL_PENDING_VECTOR); } -void perf_counters_lapic_init(int nmi) +void perf_counters_lapic_init(void) { - u32 apic_val; - if (!x86_pmu_initialized()) return; /* - * Enable the performance counter vector in the APIC LVT: + * Always use NMI for PMU */ - apic_val = apic_read(APIC_LVTERR); - - apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED); - if (nmi) - apic_write(APIC_LVTPC, APIC_DM_NMI); - else - apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); - apic_write(APIC_LVTERR, apic_val); + apic_write(APIC_LVTPC, APIC_DM_NMI); } static int __kprobes @@ -1054,7 +1045,7 @@ void __init init_hw_perf_counters(void) pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_lapic_init(0); + perf_counters_lapic_init(); register_die_notifier(&perf_counter_nmi_notifier); } -- cgit v1.2.3 From b23f3325ed465f1bd914384884269af0d106778c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:13:03 +0200 Subject: perf_counter: Rename various fields A few renames: s/irq_period/sample_period/ s/irq_freq/sample_freq/ s/PERF_RECORD_/PERF_SAMPLE_/ s/record_type/sample_type/ And change both the new sample_type and read_format to u64. 
Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 316b0c995f3..ec06aa5e928 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -290,11 +290,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; hw_event->nmi = 1; - if (!hwc->irq_period) - hwc->irq_period = x86_pmu.max_period; + if (!hwc->sample_period) + hwc->sample_period = x86_pmu.max_period; atomic64_set(&hwc->period_left, - min(x86_pmu.max_period, hwc->irq_period)); + min(x86_pmu.max_period, hwc->sample_period)); /* * Raw event type provide the config in the event structure @@ -462,7 +462,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s64 period = min(x86_pmu.max_period, hwc->irq_period); + s64 period = min(x86_pmu.max_period, hwc->sample_period); int err; /* -- cgit v1.2.3 From 8a016db386195b193e2a8aeddff9fe937dcb7a40 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:27:45 +0200 Subject: perf_counter: Remove the last nmi/irq bits IRQ (non-NMI) sampling is not used anymore - remove the last few bits. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ec06aa5e928..9e144fbebd2 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -284,12 +284,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!hw_event->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - /* - * Use NMI events all the time: - */ - hwc->nmi = 1; - hw_event->nmi = 1; - if (!hwc->sample_period) hwc->sample_period = x86_pmu.max_period; -- cgit v1.2.3 From e4abb5d4f7ddabc1fc7c392cf0a10d8e5868c9ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 16:08:20 +0200 Subject: perf_counter: x86: Emulate longer sample periods Do as Power already does, emulate sample periods up to 2^63-1 by composing them of smaller values limited by hardware capabilities. Only once we wrap the software period do we generate an overflow event. Just 10 lines of new code. 
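( A sketch of the composition, simplified from the x86_perf_counter_set_period() changes in the diff below: program at most max_period into the hardware, and report an overflow only when the full software period has elapsed: )

	static int set_period_sketch(struct hw_perf_counter *hwc, int idx)
	{
		s64 left = atomic64_read(&hwc->period_left);
		int wrapped = 0;

		if (left <= 0) {			/* software period elapsed */
			left += hwc->sample_period;
			atomic64_set(&hwc->period_left, left);
			wrapped = 1;
		}
		if (left > x86_pmu.max_period)		/* clamp to hw capability */
			left = x86_pmu.max_period;

		checking_wrmsrl(hwc->counter_base + idx,
				(u64)(-left) & x86_pmu.counter_mask);

		return wrapped;		/* only now generate an overflow event */
	}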
Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9e144fbebd2..904571bea71 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -287,8 +287,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!hwc->sample_period) hwc->sample_period = x86_pmu.max_period; - atomic64_set(&hwc->period_left, - min(x86_pmu.max_period, hwc->sample_period)); + atomic64_set(&hwc->period_left, hwc->sample_period); /* * Raw event type provide the config in the event structure @@ -451,13 +450,13 @@ static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); * Set the next IRQ period, based on the hwc->period_left value. * To be called with the counter disabled in hw: */ -static void +static int x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s64 period = min(x86_pmu.max_period, hwc->sample_period); - int err; + s64 period = hwc->sample_period; + int err, ret = 0; /* * If we are way outside a reasoable range then just skip forward: @@ -465,11 +464,13 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (unlikely(left <= -period)) { left = period; atomic64_set(&hwc->period_left, left); + ret = 1; } if (unlikely(left <= 0)) { left += period; atomic64_set(&hwc->period_left, left); + ret = 1; } /* * Quirk: certain CPUs dont like it if just 1 event is left: @@ -477,6 +478,9 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (unlikely(left < 2)) left = 2; + if (left > x86_pmu.max_period) + left = x86_pmu.max_period; + per_cpu(prev_left[idx], smp_processor_id()) = left; /* @@ -487,6 +491,8 @@ x86_perf_counter_set_period(struct perf_counter *counter, err = checking_wrmsrl(hwc->counter_base + idx, (u64)(-left) & x86_pmu.counter_mask); + + return ret; } static inline void @@ -706,16 +712,19 @@ static void x86_pmu_disable(struct perf_counter *counter) * Save and restart an expired counter. 
Called by NMI contexts, * so it has to be careful about preempting normal counter ops: */ -static void intel_pmu_save_and_restart(struct perf_counter *counter) +static int intel_pmu_save_and_restart(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; + int ret; x86_perf_counter_update(counter, hwc, idx); - x86_perf_counter_set_period(counter, hwc, idx); + ret = x86_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) intel_pmu_enable_counter(hwc, idx); + + return ret; } static void intel_pmu_reset(void) @@ -782,7 +791,9 @@ again: if (!test_bit(bit, cpuc->active_mask)) continue; - intel_pmu_save_and_restart(counter); + if (!intel_pmu_save_and_restart(counter)) + continue; + if (perf_counter_overflow(counter, nmi, regs, 0)) intel_pmu_disable_counter(&counter->hw, bit); } @@ -824,9 +835,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) continue; /* counter overflow */ - x86_perf_counter_set_period(counter, hwc, idx); handled = 1; inc_irq_stat(apic_perf_irqs); + if (!x86_perf_counter_set_period(counter, hwc, idx)) + continue; + if (perf_counter_overflow(counter, nmi, regs, 0)) amd_pmu_disable_counter(hwc, idx); } -- cgit v1.2.3 From 0d48696f87e3618b0d35bd3e4e9d7c188d51e7de Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 19:22:16 +0200 Subject: perf_counter: Rename perf_counter_hw_event => perf_counter_attr The structure isn't hw only and when I read event, I think about those things that fall out the other end. Rename the thing. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 904571bea71..e16e8c13132 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -247,11 +247,11 @@ static inline int x86_pmu_initialized(void) } /* - * Setup the hardware configuration for a given hw_event_type + * Setup the hardware configuration for a given attr_type */ static int __hw_perf_counter_init(struct perf_counter *counter) { - struct perf_counter_hw_event *hw_event = &counter->hw_event; + struct perf_counter_attr *attr = &counter->attr; struct hw_perf_counter *hwc = &counter->hw; int err; @@ -279,9 +279,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Count user and OS events unless requested not to. 
*/ - if (!hw_event->exclude_user) + if (!attr->exclude_user) hwc->config |= ARCH_PERFMON_EVENTSEL_USR; - if (!hw_event->exclude_kernel) + if (!attr->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; if (!hwc->sample_period) @@ -292,15 +292,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (perf_event_raw(hw_event)) { - hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event)); + if (perf_event_raw(attr)) { + hwc->config |= x86_pmu.raw_event(perf_event_config(attr)); } else { - if (perf_event_id(hw_event) >= x86_pmu.max_events) + if (perf_event_id(attr) >= x86_pmu.max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= x86_pmu.event_map(perf_event_id(hw_event)); + hwc->config |= x86_pmu.event_map(perf_event_id(attr)); } counter->destroy = hw_perf_counter_destroy; -- cgit v1.2.3 From a32881066e58346f2901afe0ebdfbf0c562877e5 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Wed, 3 Jun 2009 13:12:55 +0800 Subject: perf_counter/x86: Remove the IRQ (non-NMI) handling bits Remove the IRQ (non-NMI) handling bits as NMI will be used always. Signed-off-by: Yong Wang Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090603051255.GA2791@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/entry_arch.h | 1 - arch/x86/include/asm/hw_irq.h | 1 - arch/x86/include/asm/irq_vectors.h | 5 ----- arch/x86/kernel/cpu/perf_counter.c | 21 ++++++--------------- arch/x86/kernel/entry_64.S | 2 -- arch/x86/kernel/irqinit_32.c | 1 - arch/x86/kernel/irqinit_64.c | 1 - 7 files changed, 6 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index fe24d280249..d750a10ccad 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -49,7 +49,6 @@ BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) #ifdef CONFIG_PERF_COUNTERS -BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) #endif diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 7309c0ad690..4b4921d7a28 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -29,7 +29,6 @@ extern void apic_timer_interrupt(void); extern void generic_interrupt(void); extern void error_interrupt(void); -extern void perf_counter_interrupt(void); extern void perf_pending_interrupt(void); extern void spurious_interrupt(void); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 545bb811ccb..4492e19f839 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -106,11 +106,6 @@ */ #define LOCAL_TIMER_VECTOR 0xef -/* - * Performance monitoring interrupt vector: - */ -#define LOCAL_PERF_VECTOR 0xee - /* * Generic system vector for platform specific use */ diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index e16e8c13132..12cc05ed9f4 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -40,7 +40,7 @@ struct cpu_hw_counters { struct x86_pmu { const char *name; int version; - int (*handle_irq)(struct pt_regs *, int); + int (*handle_irq)(struct pt_regs *); void (*disable_all)(void); void (*enable_all)(void); void 
(*enable)(struct hw_perf_counter *, int); @@ -755,7 +755,7 @@ static void intel_pmu_reset(void) * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: */ -static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) +static int intel_pmu_handle_irq(struct pt_regs *regs) { struct cpu_hw_counters *cpuc; struct cpu_hw_counters; @@ -794,7 +794,7 @@ again: if (!intel_pmu_save_and_restart(counter)) continue; - if (perf_counter_overflow(counter, nmi, regs, 0)) + if (perf_counter_overflow(counter, 1, regs, 0)) intel_pmu_disable_counter(&counter->hw, bit); } @@ -812,7 +812,7 @@ again: return 1; } -static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) +static int amd_pmu_handle_irq(struct pt_regs *regs) { int cpu, idx, handled = 0; struct cpu_hw_counters *cpuc; @@ -840,22 +840,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) if (!x86_perf_counter_set_period(counter, hwc, idx)) continue; - if (perf_counter_overflow(counter, nmi, regs, 0)) + if (perf_counter_overflow(counter, 1, regs, 0)) amd_pmu_disable_counter(hwc, idx); } return handled; } -void smp_perf_counter_interrupt(struct pt_regs *regs) -{ - irq_enter(); - apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); - ack_APIC_irq(); - x86_pmu.handle_irq(regs, 0); - irq_exit(); -} - void smp_perf_pending_interrupt(struct pt_regs *regs) { irq_enter(); @@ -910,7 +901,7 @@ perf_counter_nmi_handler(struct notifier_block *self, * If the first NMI handles both, the latter will be empty and daze * the CPU. */ - x86_pmu.handle_irq(regs, 1); + x86_pmu.handle_irq(regs); return NOTIFY_STOP; } diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 89100461914..7985c010f8a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1026,8 +1026,6 @@ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt #ifdef CONFIG_PERF_COUNTERS -apicinterrupt LOCAL_PERF_VECTOR \ - perf_counter_interrupt smp_perf_counter_interrupt apicinterrupt LOCAL_PENDING_VECTOR \ perf_pending_interrupt smp_perf_pending_interrupt #endif diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 3190a6b961e..205bdd880d3 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -165,7 +165,6 @@ static void __init apic_intr_init(void) alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); # ifdef CONFIG_PERF_COUNTERS - alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); # endif diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 53ceb26f80f..fa6ef692000 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -155,7 +155,6 @@ static void __init apic_intr_init(void) /* Performance monitoring interrupt: */ #ifdef CONFIG_PERF_COUNTERS - alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); #endif } -- cgit v1.2.3 From 128f048f0f0d2a477ad2555e7acd2ad15a1b6061 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2009 22:19:36 +0200 Subject: perf_counter: Fix throttling lock-up The throttling logic is broken and we can lock up with too-small hw sampling intervals. Make the throttling code more robust: disable counters even if we already disabled them. ( Also clean up whitespace damage I noticed while reading various pieces of code related to throttling.
) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 12cc05ed9f4..8f53f3a7da2 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -91,7 +91,7 @@ static u64 intel_pmu_raw_event(u64 event) #define CORE_EVNTSEL_INV_MASK 0x00800000ULL #define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL -#define CORE_EVNTSEL_MASK \ +#define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ CORE_EVNTSEL_UNIT_MASK | \ CORE_EVNTSEL_EDGE_MASK | \ -- cgit v1.2.3 From f7b6eb3fa07269da20dbbde8ba37a0273fdbd9c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2009 14:04:51 +0200 Subject: x86: Set context.vdso before installing the mapping In order to make arch_vma_name() work from inside install_special_mapping() we need to set the context.vdso before calling it. ( This is needed for performance counters to be able to track this special executable area. ) Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/vdso/vdso32-setup.c | 6 +++++- arch/x86/vdso/vma.c | 7 +++++-- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 1241f118ab5..58bc00f68b1 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -338,6 +338,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) } } + current->mm->context.vdso = (void *)addr; + if (compat_uses_vma || !compat) { /* * MAYWRITE to allow gdb to COW and set breakpoints @@ -358,11 +360,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) goto up_fail; } - current->mm->context.vdso = (void *)addr; current_thread_info()->sysenter_return = VDSO32_SYMBOL(addr, SYSENTER_RETURN); up_fail: + if (ret) + current->mm->context.vdso = NULL; + up_write(&mm->mmap_sem); return ret; diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 7133cdf9098..93b7a2938b2 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -115,15 +115,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) goto up_fail; } + current->mm->context.vdso = (void *)addr; + ret = install_special_mapping(mm, addr, vdso_size, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| VM_ALWAYSDUMP, vdso_pages); - if (ret) + if (ret) { + current->mm->context.vdso = NULL; goto up_fail; + } - current->mm->context.vdso = (void *)addr; up_fail: up_write(&mm->mmap_sem); return ret; -- cgit v1.2.3 From a21ca2cac582886a3e95c8bb84ff7c52d4d15e54 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 6 Jun 2009 09:58:57 +0200 Subject: perf_counter: Separate out attr->type from attr->config Counter type is a frequently used value and we do a lot of bit juggling by encoding and decoding it from attr->config. Clean this up by creating a separate attr->type field. Also clean up the various similarly complex user-space bits all around counter attribute management. The net improvement is significant, and it will be easier to add a new major type (which is what triggered this cleanup). (This changes the ABI, all tools are adapted.) (PowerPC build-tested.) 
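( With the split, selecting a counter becomes two orthogonal fields. An illustrative snippet; PERF_TYPE_HARDWARE is assumed to be the generic hardware-event type of this series: )

	struct perf_counter_attr attr = {
		.type		= PERF_TYPE_HARDWARE,	/* major type */
		.config		= PERF_COUNT_INSTRUCTIONS, /* event within type */
		.sample_period	= 100000,
	};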
Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8f53f3a7da2..430e048f285 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -292,15 +292,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (perf_event_raw(attr)) { - hwc->config |= x86_pmu.raw_event(perf_event_config(attr)); + if (attr->type == PERF_TYPE_RAW) { + hwc->config |= x86_pmu.raw_event(attr->config); } else { - if (perf_event_id(attr) >= x86_pmu.max_events) + if (attr->config >= x86_pmu.max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= x86_pmu.event_map(perf_event_id(attr)); + hwc->config |= x86_pmu.event_map(attr->config); } counter->destroy = hw_perf_counter_destroy; -- cgit v1.2.3 From 8326f44da090d6d304d29b9fdc7fb3e20889e329 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 5 Jun 2009 20:22:46 +0200 Subject: perf_counter: Implement generalized cache event types Extend generic event enumeration with the PERF_TYPE_HW_CACHE method. This is a 3-dimensional space: { L1-D, L1-I, L2, ITLB, DTLB, BPU } x { load, store, prefetch } x { accesses, misses } User-space passes in the 3 coordinates and the kernel provides a counter. (if the hardware supports that type and if the combination makes sense.) Combinations that make no sense produce a -EINVAL. Combinations that are not supported by the hardware produce -ENOTSUP. Extend the tools to deal with this, and rewrite the event symbol parsing code with various popular aliases for the units and access methods above. So 'l1-cache-miss' and 'l1d-read-ops' are both valid aliases. ( x86 is supported for now, with the Nehalem event table filled in, and with Core2 and Atom having placeholder tables. ) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 201 +++++++++++++++++++++++++++++++++++-- 1 file changed, 193 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 430e048f285..e86679fa521 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -83,6 +83,128 @@ static u64 intel_pmu_event_map(int event) return intel_perfmon_event_map[event]; } +/* + * Generalized hw caching related event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'event makes no sense on + * this CPU', any other value means the raw event + * ID. 
+ */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +static u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +static const u64 nehalem_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0480, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(L2 ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES */ + [ C(RESULT_MISS) ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static const u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + /* To be filled in */ +}; + +static const u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + /* To be filled in */ +}; + static u64 intel_pmu_raw_event(u64 event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL @@ -246,6 +368,39 @@ static inline int x86_pmu_initialized(void) return x86_pmu.handle_irq != NULL; } +static inline int +set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) +{ + unsigned int cache_type, cache_op, cache_result; + u64 config, val; + + config = attr->config; + + cache_type = (config >> 0) & 0xff; + if (cache_type >= PERF_COUNT_HW_CACHE_MAX) + return -EINVAL; + + cache_op = (config >> 8) & 0xff; + if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) + 
return -EINVAL; + + cache_result = (config >> 16) & 0xff; + if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + val = hw_cache_event_ids[cache_type][cache_op][cache_result]; + + if (val == 0) + return -ENOENT; + + if (val == -1) + return -EINVAL; + + hwc->config |= val; + + return 0; +} + /* * Setup the hardware configuration for a given attr_type */ @@ -288,22 +443,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->sample_period = x86_pmu.max_period; atomic64_set(&hwc->period_left, hwc->sample_period); + counter->destroy = hw_perf_counter_destroy; /* * Raw event type provide the config in the event structure */ if (attr->type == PERF_TYPE_RAW) { hwc->config |= x86_pmu.raw_event(attr->config); - } else { - if (attr->config >= x86_pmu.max_events) - return -EINVAL; - /* - * The generic map: - */ - hwc->config |= x86_pmu.event_map(attr->config); + return 0; } - counter->destroy = hw_perf_counter_destroy; + if (attr->type == PERF_TYPE_HW_CACHE) + return set_ext_hw_attr(hwc, attr); + + if (attr->config >= x86_pmu.max_events) + return -EINVAL; + /* + * The generic map: + */ + hwc->config |= x86_pmu.event_map(attr->config); return 0; } @@ -989,6 +1147,33 @@ static int intel_pmu_init(void) rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + /* + * Nehalem: + */ + switch (boot_cpu_data.x86_model) { + case 17: + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* + PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + + pr_info("... installed Core2 event tables\n"); + break; + default: + case 26: + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* + PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + + pr_info("... installed Nehalem/Corei7 event tables\n"); + break; + case 28: + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* + PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + + pr_info("... installed Atom event tables\n"); + break; + } return 0; } -- cgit v1.2.3 From 0312af84164215a452f2a94957ebd9bce86e0204 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Jun 2009 07:42:04 +0200 Subject: perf_counter, x86: Implement generalized cache event types, add Core2 support Fill in core2_hw_cache_event_id[] with the Core2 model specific events. The events can be used in all the tools via the -e (--event) parameter, for example "-e l1-misses" or -"-e l2-accesses" or "-e l2-write-misses". 
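( The three coordinates are packed into attr->config one byte each, matching the decoding in set_ext_hw_attr() above. For example, "l2-write-misses" would be, as an illustrative snippet: )

	attr.type   = PERF_TYPE_HW_CACHE;
	attr.config = (PERF_COUNT_HW_CACHE_L2          <<  0) |	/* cache  */
		      (PERF_COUNT_HW_CACHE_OP_WRITE    <<  8) |	/* op     */
		      (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);	/* result */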
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 85 +++++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index e86679fa521..b1f71ff5025 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -194,7 +194,90 @@ static const u64 core2_hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - /* To be filled in */ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L2 ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; static const u64 atom_hw_cache_event_ids -- cgit v1.2.3 From ad689220614b6c7c0b13b70d742f358e9310e71e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Jun 2009 09:30:41 +0200 Subject: perf_counter, x86: Implement generalized cache event types, add Atom support Fill in core2_hw_cache_event_id[] with the Atom model specific events. The events can be used in all the tools via the -e (--event) parameter, for example "-e l1-misses" or -"-e l2-accesses" or "-e l2-write-misses". ( Note: these are straight from the Intel manuals - not tested yet.) 
Signed-off-by: Thomas Gleixner
Cc: Peter Zijlstra
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/cpu/perf_counter.c | 85 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 84 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b1f71ff5025..71590e09d16 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -285,7 +285,90 @@ static const u64 atom_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
 {
-	/* To be filled in */
+	[ C(L1D) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
+			[ C(RESULT_MISS) ] = 0,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = 0x2241, /* L1D_CACHE.ST */
+			[ C(RESULT_MISS) ] = 0,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = 0x0,
+			[ C(RESULT_MISS) ] = 0,
+		},
+	},
+	[ C(L1I ) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
+			[ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS) ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS) ] = 0,
+		},
+	},
+	[ C(L2 ) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
+			[ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
+			[ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS) ] = 0,
+		},
+	},
+	[ C(DTLB) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
+			[ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
+			[ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS) ] = 0,
+		},
+	},
+	[ C(ITLB) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+			[ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS) ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS) ] = -1,
+		},
+	},
+	[ C(BPU ) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+			[ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS) ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS) ] = -1,
+		},
+	},
 };
 
 static u64 intel_pmu_raw_event(u64 event)
--
cgit v1.2.3

From 1123e3ad73697d64ad99f0104bbe49f8b52d7d65 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Fri, 29 May 2009 11:25:09 +0200
Subject: perf_counter: Clean up x86 boot messages

Standardize and tidy up all the messages we print during
perfcounter initialization.
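[ Editorial illustration, not part of the patch: pr_info() opens the summary
  line and the pr_cont() calls below complete it, so a successful init now
  prints a single summary line followed by the detail block, roughly like
  (values and PMU name made up for the example):

	Performance Counters: Core2 events, Intel PMU driver.
	... version:                 2
	... bit width:               40
]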
Cc: Peter Zijlstra
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/cpu/perf_counter.c | 46 ++++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 71590e09d16..0339d195a3f 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1298,23 +1298,22 @@ static int intel_pmu_init(void)
 	if (version < 2)
 		return -ENODEV;
 
-	x86_pmu = intel_pmu;
-	x86_pmu.version = version;
-	x86_pmu.num_counters = eax.split.num_counters;
+	x86_pmu				= intel_pmu;
+	x86_pmu.version			= version;
+	x86_pmu.num_counters		= eax.split.num_counters;
+	x86_pmu.counter_bits		= eax.split.bit_width;
+	x86_pmu.counter_mask		= (1ULL << eax.split.bit_width) - 1;
 
 	/*
 	 * Quirk: v2 perfmon does not report fixed-purpose counters, so
 	 * assume at least 3 counters:
 	 */
-	x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
-
-	x86_pmu.counter_bits = eax.split.bit_width;
-	x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
+	x86_pmu.num_counters_fixed	= max((int)edx.split.num_counters_fixed, 3);
 
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
 
 	/*
-	 * Nehalem:
+	 * Install the hw-cache-events table:
 	 */
 	switch (boot_cpu_data.x86_model) {
 	case 17:
@@ -1322,7 +1321,7 @@
 			sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
 			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
 
-		pr_info("... installed Core2 event tables\n");
+		pr_cont("Core2 events, ");
 		break;
 	default:
 	case 26:
@@ -1330,14 +1329,14 @@
 			sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
 			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
 
-		pr_info("... installed Nehalem/Corei7 event tables\n");
+		pr_cont("Nehalem/Corei7 events, ");
 		break;
 	case 28:
 		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
 			sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
 			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
 
-		pr_info("... installed Atom event tables\n");
+		pr_cont("Atom events, ");
 		break;
 	}
 	return 0;
@@ -1353,6 +1352,8 @@ void __init init_hw_perf_counters(void)
 {
 	int err;
 
+	pr_info("Performance Counters: ");
+
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_INTEL:
 		err = intel_pmu_init();
@@ -1363,14 +1364,13 @@
 	default:
 		return;
 	}
-	if (err != 0)
+	if (err != 0) {
+		pr_cont("no PMU driver, software counters only.\n");
 		return;
+	}
 
-	pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name);
-	pr_info("... version: %d\n", x86_pmu.version);
-	pr_info("... bit width: %d\n", x86_pmu.counter_bits);
+	pr_cont("%s PMU driver.\n", x86_pmu.name);
 
-	pr_info("... num counters: %d\n", x86_pmu.num_counters);
 	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
 		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
 		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
@@ -1379,23 +1379,25 @@
 	perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
 	perf_max_counters = x86_pmu.num_counters;
 
-	pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask);
-	pr_info("... max period: %016Lx\n", x86_pmu.max_period);
-
 	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
 		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
 		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
 		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
 	}
-	pr_info("... fixed counters: %d\n", x86_pmu.num_counters_fixed);
 
 	perf_counter_mask |= ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
-	pr_info("... counter mask: %016Lx\n", perf_counter_mask);
-
 	perf_counters_lapic_init();
 	register_die_notifier(&perf_counter_nmi_notifier);
+
+	pr_info("... version:                 %d\n",     x86_pmu.version);
+	pr_info("... bit width:               %d\n",     x86_pmu.counter_bits);
+	pr_info("... generic counters:        %d\n",     x86_pmu.num_counters);
+	pr_info("... value mask:              %016Lx\n", x86_pmu.counter_mask);
+	pr_info("... max period:              %016Lx\n", x86_pmu.max_period);
+	pr_info("... fixed-purpose counters:  %d\n",     x86_pmu.num_counters_fixed);
+	pr_info("... counter mask:            %016Lx\n", perf_counter_mask);
 }
 
 static inline void x86_pmu_read(struct perf_counter *counter)
--
cgit v1.2.3

From f86748e91a14bd6cc49477560f33ed5d59896e89 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 8 Jun 2009 22:33:10 +0200
Subject: perf_counter, x86: Implement generalized cache event types, add AMD support

Fill in amd_0f_hw_cache_event_ids[] with the AMD CPU specific events,
for family 0x0f, 0x10 and 0x11.

There's apparently no distinction between load and store events, so
we only fill in the load events.

Signed-off-by: Thomas Gleixner
Cc: Peter Zijlstra
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/cpu/perf_counter.c | 102 +++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 0339d195a3f..93af821ebe5 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -389,6 +389,97 @@ static u64 intel_pmu_raw_event(u64 event)
 	return event & CORE_EVNTSEL_MASK;
 }
 
+static const u64 amd_0f_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+	[ C(L1D) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS) ] = 0,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS) ] = 0,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS) ] = 0,
+		},
+	},
+	[ C(L1I ) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
+			[ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS) ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS) ] = 0,
+		},
+	},
+	[ C(L2 ) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS) ] = 0,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS) ] = 0,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS) ] = 0,
+		},
+	},
+	[ C(DTLB) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS) ] = 0,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS) ] = 0,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS) ] = 0,
+		},
+	},
+	[ C(ITLB) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */
+			[ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS) ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS) ] = -1,
+		},
+	},
+	[ C(BPU ) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
+			[ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS) ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS) ] = -1,
+		},
+	},
+};
+
 /*
  * AMD Performance Monitor K7 and later.
  */
@@ -1345,6 +1436,17 @@
 static int amd_pmu_init(void)
 {
 	x86_pmu = amd_pmu;
+
+	switch (boot_cpu_data.x86) {
+	case 0x0f:
+	case 0x10:
+	case 0x11:
+		memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids,
+		       sizeof(hw_cache_event_ids));
+
+		pr_cont("AMD Family 0f/10/11 events, ");
+		break;
+	}
 	return 0;
 }
--
cgit v1.2.3

From 820a644211bc1ac7715333abdb0f0b9ea4fbb549 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 8 Jun 2009 19:10:25 +0200
Subject: perf_counter, x86: Clean up hw_cache_event ids copies

Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/cpu/perf_counter.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 93af821ebe5..56001feeffc 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1409,23 +1409,20 @@ static int intel_pmu_init(void)
 	switch (boot_cpu_data.x86_model) {
 	case 17:
 		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
-		       sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
-		       PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+		       sizeof(hw_cache_event_ids));
 
 		pr_cont("Core2 events, ");
 		break;
 	default:
 	case 26:
 		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
-		       sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
-		       PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+		       sizeof(hw_cache_event_ids));
 
 		pr_cont("Nehalem/Corei7 events, ");
 		break;
 	case 28:
 		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
-		       sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
-		       PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+		       sizeof(hw_cache_event_ids));
 
 		pr_cont("Atom events, ");
 		break;
--
cgit v1.2.3

From fecc8ac8496fce96069724f54daba8e7078b0082 Mon Sep 17 00:00:00 2001
From: Yong Wang
Date: Tue, 9 Jun 2009 21:15:53 +0800
Subject: perf_counter, x86: Correct some event and umask values for Intel processors

Correct some event and UMASK values according to Intel SDM,
in the Nehalem and Atom tables.
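[ Editorial note, not part of the patch: with the (umask << 8) | event
  layout of these table entries the corrections are easy to read, e.g. the
  Nehalem L1I.READS fix keeps event select 0x80 and only corrects the unit
  mask:

	/* old: 0x0480 = umask 0x04, event 0x80 (wrong umask)       */
	/* new: 0x0380 = umask 0x03, event 0x80 (matches Intel SDM) */
]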
Signed-off-by: Yong Wang
Cc: Peter Zijlstra
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
LKML-Reference: <20090609131553.GA12489@ywang-moblin2.bj.intel.com>
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/cpu/perf_counter.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 56001feeffc..40978aac6e0 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -119,7 +119,7 @@
 	},
 	[ C(L1I ) ] = {
 		[ C(OP_READ) ] = {
-			[ C(RESULT_ACCESS) ] = 0x0480, /* L1I.READS */
+			[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
 			[ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
 		},
 		[ C(OP_WRITE) ] = {
@@ -162,7 +162,7 @@
 	[ C(ITLB) ] = {
 		[ C(OP_READ) ] = {
 			[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
-			[ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISS_RETIRED */
+			[ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
 		},
 		[ C(OP_WRITE) ] = {
 			[ C(RESULT_ACCESS) ] = -1,
@@ -291,7 +291,7 @@
 			[ C(RESULT_MISS) ] = 0,
 		},
 		[ C(OP_WRITE) ] = {
-			[ C(RESULT_ACCESS) ] = 0x2241, /* L1D_CACHE.ST */
+			[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
 			[ C(RESULT_MISS) ] = 0,
 		},
 		[ C(OP_PREFETCH) ] = {
@@ -301,8 +301,8 @@
 	},
 	[ C(L1I ) ] = {
 		[ C(OP_READ) ] = {
-			[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
-			[ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
+			[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+			[ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
 		},
 		[ C(OP_WRITE) ] = {
 			[ C(RESULT_ACCESS) ] = -1,
@@ -329,11 +329,11 @@
 	},
 	[ C(DTLB) ] = {
 		[ C(OP_READ) ] = {
-			[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
+			[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
 			[ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
 		},
 		[ C(OP_WRITE) ] = {
-			[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
+			[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
 			[ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
 		},
 		[ C(OP_PREFETCH) ] = {
--
cgit v1.2.3

From dc81081b2d9a6a9d64dad1bef1e5fc9fb660e53e Mon Sep 17 00:00:00 2001
From: Yong Wang
Date: Wed, 10 Jun 2009 17:06:12 +0800
Subject: perf_counter/x86: Fix the model number of Intel Core2 processors

Fix the model number of Intel Core2 processors according to the
documentation: Intel Processor Identification with the CPUID
Instruction: http://www.intel.com/support/processors/sb/cs-009861.htm

Signed-off-by: Yong Wang
Also-Reported-by: Arnd Bergmann
Cc: Peter Zijlstra
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
LKML-Reference: <20090610090612.GA26580@ywang-moblin2.bj.intel.com>
[ Added two more model numbers suggested by Arnd Bergmann ]
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/cpu/perf_counter.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 40978aac6e0..49f258537cb 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1407,7 +1407,10 @@ static int intel_pmu_init(void)
 	 * Install the hw-cache-events table:
 	 */
 	switch (boot_cpu_data.x86_model) {
-	case 17:
+	case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+	case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
"Merom-L"/"Conroe-L" */ + case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 29: /* six-core 45 nm xeon "Dunnington" */ memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, sizeof(hw_cache_event_ids)); -- cgit v1.2.3 From bd2b5b12849a3446abad0b25e920f86f5480b309 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 13:40:57 +0200 Subject: perf_counter: More aggressive frequency adjustment Also employ the overflow handler to adjust the frequency, this results in a stable frequency in about 40~50 samples, instead of that many ticks. This also means we can start sampling at a sample period of 1 without running head-first into the throttle. It relies on sched_clock() to accurately measure the time difference between the overflow NMIs. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 49f258537cb..240ca563063 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -696,10 +696,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!attr->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (!hwc->sample_period) + if (!hwc->sample_period) { hwc->sample_period = x86_pmu.max_period; + atomic64_set(&hwc->period_left, hwc->sample_period); + } - atomic64_set(&hwc->period_left, hwc->sample_period); counter->destroy = hw_perf_counter_destroy; /* -- cgit v1.2.3 From df1a132bf3d3508f863336c80a27806a2ac947e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 21:02:22 +0200 Subject: perf_counter: Introduce struct for sample data For easy extension of the sample data, put it in a structure. 
Signed-off-by: Peter Zijlstra
Cc: Peter Zijlstra
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/cpu/perf_counter.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 240ca563063..82a23d487f9 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1173,11 +1173,14 @@ static void intel_pmu_reset(void)
  */
 static int intel_pmu_handle_irq(struct pt_regs *regs)
 {
+	struct perf_sample_data data;
 	struct cpu_hw_counters *cpuc;
-	struct cpu_hw_counters;
 	int bit, cpu, loops;
 	u64 ack, status;
 
+	data.regs = regs;
+	data.addr = 0;
+
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
@@ -1210,7 +1213,7 @@ again:
 		if (!intel_pmu_save_and_restart(counter))
 			continue;
 
-		if (perf_counter_overflow(counter, 1, regs, 0))
+		if (perf_counter_overflow(counter, 1, &data))
 			intel_pmu_disable_counter(&counter->hw, bit);
 	}
 
@@ -1230,12 +1233,16 @@ again:
 
 static int amd_pmu_handle_irq(struct pt_regs *regs)
 {
-	int cpu, idx, handled = 0;
+	struct perf_sample_data data;
 	struct cpu_hw_counters *cpuc;
 	struct perf_counter *counter;
 	struct hw_perf_counter *hwc;
+	int cpu, idx, handled = 0;
 	u64 val;
 
+	data.regs = regs;
+	data.addr = 0;
+
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
@@ -1256,7 +1263,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
 		if (!x86_perf_counter_set_period(counter, hwc, idx))
 			continue;
 
-		if (perf_counter_overflow(counter, 1, regs, 0))
+		if (perf_counter_overflow(counter, 1, &data))
 			amd_pmu_disable_counter(hwc, idx);
 	}
--
cgit v1.2.3

From 9e350de37ac9607012fcf9c5314a28fbddf8f43c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 10 Jun 2009 21:34:59 +0200
Subject: perf_counter: Accurate period data

We currently log hw.sample_period for PERF_SAMPLE_PERIOD, however this
is incorrect. When we adjust the period, it will only take effect the
next cycle but report it for the current cycle. So when we adjust the
period for every cycle, we're always wrong.

Solve this by keeping track of the last_period.
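[ Editorial note, not part of the patch: the overflow handler then reports
  the period that just elapsed before reprogramming the next one:

	data.period = counter->hw.last_period;	/* period that just expired */

  so if the frequency code bumps sample_period from, say, 1000 to 2000
  mid-stream, the overflow that already happened is still attributed
  1000 events. ]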
Signed-off-by: Peter Zijlstra
Cc: Peter Zijlstra
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/cpu/perf_counter.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 82a23d487f9..57ae1bec81b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -698,6 +698,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 
 	if (!hwc->sample_period) {
 		hwc->sample_period = x86_pmu.max_period;
+		hwc->last_period = hwc->sample_period;
 		atomic64_set(&hwc->period_left, hwc->sample_period);
 	}
 
@@ -880,12 +881,14 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 	if (unlikely(left <= -period)) {
 		left = period;
 		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
 		ret = 1;
 	}
 
 	if (unlikely(left <= 0)) {
 		left += period;
 		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
 		ret = 1;
 	}
 	/*
@@ -1257,9 +1260,12 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
 		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
 			continue;
 
-		/* counter overflow */
-		handled = 1;
-		inc_irq_stat(apic_perf_irqs);
+		/*
+		 * counter overflow
+		 */
+		handled = 1;
+		data.period = counter->hw.last_period;
+
 		if (!x86_perf_counter_set_period(counter, hwc, idx))
 			continue;
 
 		if (perf_counter_overflow(counter, 1, &data))
 			amd_pmu_disable_counter(hwc, idx);
 	}
 
+	if (handled)
+		inc_irq_stat(apic_perf_irqs);
+
 	return handled;
 }
--
cgit v1.2.3

From f4dbfa8f3131a84257223393905f7efad0ca5996 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 11 Jun 2009 14:06:28 +0200
Subject: perf_counter: Standardize event names

Pure renames only, to PERF_COUNT_HW_* and PERF_COUNT_SW_*.
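[ Editorial sketch, not part of the patch: after the rename the generic
  hardware ids read as below; the enum layout is assumed from
  include/linux/perf_counter.h, the names themselves all appear in the diff:

	enum perf_hw_id {
		PERF_COUNT_HW_CPU_CYCLES,
		PERF_COUNT_HW_INSTRUCTIONS,
		PERF_COUNT_HW_CACHE_REFERENCES,
		PERF_COUNT_HW_CACHE_MISSES,
		PERF_COUNT_HW_BRANCH_INSTRUCTIONS,
		PERF_COUNT_HW_BRANCH_MISSES,
		PERF_COUNT_HW_BUS_CYCLES,
	};
]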
Signed-off-by: Peter Zijlstra
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/cpu/perf_counter.c | 32 ++++++++++++++++----------------
 arch/x86/mm/fault.c                |  6 +++---
 2 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 57ae1bec81b..572fb434a66 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -69,13 +69,13 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
  */
 static const u64 intel_perfmon_event_map[] =
 {
-  [PERF_COUNT_CPU_CYCLES]		= 0x003c,
-  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
-  [PERF_COUNT_CACHE_REFERENCES]		= 0x4f2e,
-  [PERF_COUNT_CACHE_MISSES]		= 0x412e,
-  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
-  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
-  [PERF_COUNT_BUS_CYCLES]		= 0x013c,
+  [PERF_COUNT_HW_CPU_CYCLES]		= 0x003c,
+  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4f2e,
+  [PERF_COUNT_HW_CACHE_MISSES]		= 0x412e,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
+  [PERF_COUNT_HW_BUS_CYCLES]		= 0x013c,
 };
 
 static u64 intel_pmu_event_map(int event)
@@ -485,12 +485,12 @@ static const u64 amd_0f_hw_cache_event_ids
  */
 static const u64 amd_perfmon_event_map[] =
 {
-  [PERF_COUNT_CPU_CYCLES]		= 0x0076,
-  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
-  [PERF_COUNT_CACHE_REFERENCES]		= 0x0080,
-  [PERF_COUNT_CACHE_MISSES]		= 0x0081,
-  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
-  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
+  [PERF_COUNT_HW_CPU_CYCLES]		= 0x0076,
+  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0080,
+  [PERF_COUNT_HW_CACHE_MISSES]		= 0x0081,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
 };
 
 static u64 amd_pmu_event_map(int event)
@@ -970,11 +970,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 
 	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
 
-	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
 		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
-	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
 		return X86_PMC_IDX_FIXED_CPU_CYCLES;
-	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
 		return X86_PMC_IDX_FIXED_BUS_CYCLES;
 
 	return -1;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 6f9df2babe4..5c6d816f30b 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1045,7 +1045,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(regs, error_code, address);
 
-	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
+	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
@@ -1142,11 +1142,11 @@ good_area:
 
 	if (fault & VM_FAULT_MAJOR) {
 		tsk->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
 				     regs, address);
 	} else {
 		tsk->min_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
 				     regs, address);
 	}
--
cgit v1.2.3
From 8be6e8f3c3a13900169f1141870562d0c723b010 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 11 Jun 2009 14:19:11 +0200
Subject: perf_counter: Rename L2 to LL cache

The top (fastest) and last level (biggest) caches are the most
interesting ones, performance wise.

Signed-off-by: Peter Zijlstra
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
LKML-Reference:
[ Fixed the Nehalem LL table to LLC Reference/Miss events ]
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/cpu/perf_counter.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 572fb434a66..895c82e7845 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -131,7 +131,7 @@ static const u64 nehalem_hw_cache_event_ids
 			[ C(RESULT_MISS) ] = 0x0,
 		},
 	},
-	[ C(L2 ) ] = {
+	[ C(LL ) ] = {
 		[ C(OP_READ) ] = {
 			[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
 			[ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
@@ -141,8 +141,8 @@
 			[ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
 		},
 		[ C(OP_PREFETCH) ] = {
-			[ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES */
-			[ C(RESULT_MISS) ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS */
+			[ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
+			[ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
 		},
 	},
 	[ C(DTLB) ] = {
@@ -222,7 +222,7 @@ static const u64 core2_hw_cache_event_ids
 			[ C(RESULT_MISS) ] = 0,
 		},
 	},
-	[ C(L2 ) ] = {
+	[ C(LL ) ] = {
 		[ C(OP_READ) ] = {
 			[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
 			[ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
@@ -313,7 +313,7 @@ static const u64 atom_hw_cache_event_ids
 			[ C(RESULT_MISS) ] = 0,
 		},
 	},
-	[ C(L2 ) ] = {
+	[ C(LL ) ] = {
 		[ C(OP_READ) ] = {
 			[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
 			[ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
@@ -422,7 +422,7 @@ static const u64 amd_0f_hw_cache_event_ids
 			[ C(RESULT_MISS) ] = 0,
 		},
 	},
-	[ C(L2 ) ] = {
+	[ C(LL ) ] = {
 		[ C(OP_READ) ] = {
 			[ C(RESULT_ACCESS) ] = 0,
 			[ C(RESULT_MISS) ] = 0,
--
cgit v1.2.3