diff options
Diffstat (limited to 'arch/x86/kernel/cpu/perfctr-watchdog.c')
-rw-r--r-- | arch/x86/kernel/cpu/perfctr-watchdog.c | 306 |
1 files changed, 199 insertions, 107 deletions
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index f9ae93adffe..6bff382094f 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -1,11 +1,15 @@ -/* local apic based NMI watchdog for various CPUs. - This file also handles reservation of performance counters for coordination - with other users (like oprofile). - - Note that these events normally don't tick when the CPU idles. This means - the frequency varies with CPU load. - - Original code for K7/P6 written by Keith Owens */ +/* + * local apic based NMI watchdog for various CPUs. + * + * This file also handles reservation of performance counters for coordination + * with other users (like oprofile). + * + * Note that these events normally don't tick when the CPU idles. This means + * the frequency varies with CPU load. + * + * Original code for K7/P6 written by Keith Owens + * + */ #include <linux/percpu.h> #include <linux/module.h> @@ -36,12 +40,16 @@ struct wd_ops { static const struct wd_ops *wd_ops; -/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's - * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) +/* + * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's + * offset from MSR_P4_BSU_ESCR0. + * + * It will be the max for all platforms (for now) */ #define NMI_MAX_COUNTER_BITS 66 -/* perfctr_nmi_owner tracks the ownership of the perfctr registers: +/* + * perfctr_nmi_owner tracks the ownership of the perfctr registers: * evtsel_nmi_owner tracks the ownership of the event selection * - different performance counters/ event selection may be reserved for * different subsystems this reservation system just tries to coordinate @@ -73,8 +81,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) return 0; } -/* converts an msr to an appropriate reservation bit */ -/* returns the bit offset of the event selection register */ +/* + * converts an msr to an appropriate reservation bit + * returns the bit offset of the event selection register + */ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) { /* returns the bit offset of the event selection register */ @@ -114,6 +124,7 @@ int avail_to_resrv_perfctr_nmi(unsigned int msr) return (!test_bit(counter, perfctr_nmi_owner)); } +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); int reserve_perfctr_nmi(unsigned int msr) { @@ -128,6 +139,7 @@ int reserve_perfctr_nmi(unsigned int msr) return 1; return 0; } +EXPORT_SYMBOL(reserve_perfctr_nmi); void release_perfctr_nmi(unsigned int msr) { @@ -140,6 +152,7 @@ void release_perfctr_nmi(unsigned int msr) clear_bit(counter, perfctr_nmi_owner); } +EXPORT_SYMBOL(release_perfctr_nmi); int reserve_evntsel_nmi(unsigned int msr) { @@ -154,6 +167,7 @@ int reserve_evntsel_nmi(unsigned int msr) return 1; return 0; } +EXPORT_SYMBOL(reserve_evntsel_nmi); void release_evntsel_nmi(unsigned int msr) { @@ -166,11 +180,6 @@ void release_evntsel_nmi(unsigned int msr) clear_bit(counter, evntsel_nmi_owner); } - -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); -EXPORT_SYMBOL(reserve_perfctr_nmi); -EXPORT_SYMBOL(release_perfctr_nmi); -EXPORT_SYMBOL(reserve_evntsel_nmi); EXPORT_SYMBOL(release_evntsel_nmi); void disable_lapic_nmi_watchdog(void) @@ -180,8 +189,10 @@ void disable_lapic_nmi_watchdog(void) if (atomic_read(&nmi_active) <= 0) return; - on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); - wd_ops->unreserve(); + on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); + + if (wd_ops) + wd_ops->unreserve(); BUG_ON(atomic_read(&nmi_active) != 0); } @@ -202,7 +213,7 @@ void enable_lapic_nmi_watchdog(void) return; } - on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); + on_each_cpu(setup_apic_nmi_watchdog, NULL, 1); touch_nmi_watchdog(); } @@ -232,31 +243,32 @@ static unsigned int adjust_for_32bit_ctr(unsigned int hz) return retval; } -static void -write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi_hz) +static void write_watchdog_counter(unsigned int perfctr_msr, + const char *descr, unsigned nmi_hz) { u64 count = (u64)cpu_khz * 1000; do_div(count, nmi_hz); if(descr) - Dprintk("setting %s to -0x%08Lx\n", descr, count); + pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsrl(perfctr_msr, 0 - count); } static void write_watchdog_counter32(unsigned int perfctr_msr, - const char *descr, unsigned nmi_hz) + const char *descr, unsigned nmi_hz) { u64 count = (u64)cpu_khz * 1000; do_div(count, nmi_hz); if(descr) - Dprintk("setting %s to -0x%08Lx\n", descr, count); + pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsr(perfctr_msr, (u32)(-count), 0); } -/* AMD K7/K8/Family10h/Family11h support. AMD keeps this interface - nicely stable so there is not much variety */ - +/* + * AMD K7/K8/Family10h/Family11h support. + * AMD keeps this interface nicely stable so there is not much variety + */ #define K7_EVNTSEL_ENABLE (1 << 22) #define K7_EVNTSEL_INT (1 << 20) #define K7_EVNTSEL_OS (1 << 17) @@ -283,13 +295,19 @@ static int setup_k7_watchdog(unsigned nmi_hz) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); + + /* initialize the wd struct before enabling */ + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; /* unused */ + + /* ok, everything is initialized, announce that we're set */ + cpu_nmi_set_wd_enabled(); + apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= K7_EVNTSEL_ENABLE; wrmsr(evntsel_msr, evntsel, 0); - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused return 1; } @@ -325,18 +343,19 @@ static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) } static const struct wd_ops k7_wd_ops = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_k7_watchdog, - .rearm = single_msr_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_K7_PERFCTR0, - .evntsel = MSR_K7_EVNTSEL0, - .checkbit = 1ULL<<47, + .reserve = single_msr_reserve, + .unreserve = single_msr_unreserve, + .setup = setup_k7_watchdog, + .rearm = single_msr_rearm, + .stop = single_msr_stop_watchdog, + .perfctr = MSR_K7_PERFCTR0, + .evntsel = MSR_K7_EVNTSEL0, + .checkbit = 1ULL << 47, }; -/* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */ - +/* + * Intel Model 6 (PPro+,P2,P3,P-M,Core1) + */ #define P6_EVNTSEL0_ENABLE (1 << 22) #define P6_EVNTSEL_INT (1 << 20) #define P6_EVNTSEL_OS (1 << 17) @@ -366,58 +385,91 @@ static int setup_p6_watchdog(unsigned nmi_hz) wrmsr(evntsel_msr, evntsel, 0); nmi_hz = adjust_for_32bit_ctr(nmi_hz); write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); + + /* initialize the wd struct before enabling */ + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; /* unused */ + + /* ok, everything is initialized, announce that we're set */ + cpu_nmi_set_wd_enabled(); + apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= P6_EVNTSEL0_ENABLE; wrmsr(evntsel_msr, evntsel, 0); - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused return 1; } static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) { - /* P6 based Pentium M need to re-unmask + /* + * P6 based Pentium M need to re-unmask * the apic vector but it doesn't hurt * other P6 variant. - * ArchPerfom/Core Duo also needs this */ + * ArchPerfom/Core Duo also needs this + */ apic_write(APIC_LVTPC, APIC_DM_NMI); + /* P6/ARCH_PERFMON has 32 bit counter write */ write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz); } static const struct wd_ops p6_wd_ops = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_p6_watchdog, - .rearm = p6_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_P6_PERFCTR0, - .evntsel = MSR_P6_EVNTSEL0, - .checkbit = 1ULL<<39, + .reserve = single_msr_reserve, + .unreserve = single_msr_unreserve, + .setup = setup_p6_watchdog, + .rearm = p6_rearm, + .stop = single_msr_stop_watchdog, + .perfctr = MSR_P6_PERFCTR0, + .evntsel = MSR_P6_EVNTSEL0, + .checkbit = 1ULL << 39, }; -/* Intel P4 performance counters. By far the most complicated of all. */ - -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) -#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) -#define P4_ESCR_OS (1<<3) -#define P4_ESCR_USR (1<<2) -#define P4_CCCR_OVF_PMI0 (1<<26) -#define P4_CCCR_OVF_PMI1 (1<<27) -#define P4_CCCR_THRESHOLD(N) ((N)<<20) -#define P4_CCCR_COMPLEMENT (1<<19) -#define P4_CCCR_COMPARE (1<<18) -#define P4_CCCR_REQUIRED (3<<16) -#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) -#define P4_CCCR_ENABLE (1<<12) -#define P4_CCCR_OVF (1<<31) - -/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - CRU_ESCR0 (with any non-null event selector) through a complemented - max threshold. [IA32-Vol3, Section 14.9.9] */ - +/* + * Intel P4 performance counters. + * By far the most complicated of all. + */ +#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7) +#define P4_ESCR_EVENT_SELECT(N) ((N) << 25) +#define P4_ESCR_OS (1 << 3) +#define P4_ESCR_USR (1 << 2) +#define P4_CCCR_OVF_PMI0 (1 << 26) +#define P4_CCCR_OVF_PMI1 (1 << 27) +#define P4_CCCR_THRESHOLD(N) ((N) << 20) +#define P4_CCCR_COMPLEMENT (1 << 19) +#define P4_CCCR_COMPARE (1 << 18) +#define P4_CCCR_REQUIRED (3 << 16) +#define P4_CCCR_ESCR_SELECT(N) ((N) << 13) +#define P4_CCCR_ENABLE (1 << 12) +#define P4_CCCR_OVF (1 << 31) + +#define P4_CONTROLS 18 +static unsigned int p4_controls[18] = { + MSR_P4_BPU_CCCR0, + MSR_P4_BPU_CCCR1, + MSR_P4_BPU_CCCR2, + MSR_P4_BPU_CCCR3, + MSR_P4_MS_CCCR0, + MSR_P4_MS_CCCR1, + MSR_P4_MS_CCCR2, + MSR_P4_MS_CCCR3, + MSR_P4_FLAME_CCCR0, + MSR_P4_FLAME_CCCR1, + MSR_P4_FLAME_CCCR2, + MSR_P4_FLAME_CCCR3, + MSR_P4_IQ_CCCR0, + MSR_P4_IQ_CCCR1, + MSR_P4_IQ_CCCR2, + MSR_P4_IQ_CCCR3, + MSR_P4_IQ_CCCR4, + MSR_P4_IQ_CCCR5, +}; +/* + * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter + * CRU_ESCR0 (with any non-null event selector) through a complemented + * max threshold. [IA32-Vol3, Section 14.9.9] + */ static int setup_p4_watchdog(unsigned nmi_hz) { unsigned int perfctr_msr, evntsel_msr, cccr_msr; @@ -442,7 +494,8 @@ static int setup_p4_watchdog(unsigned nmi_hz) #endif ht_num = 0; - /* performance counters are shared resources + /* + * performance counters are shared resources * assign each hyperthread its own set * (re-use the ESCR0 register, seems safe * and keeps the cccr_val the same) @@ -453,12 +506,38 @@ static int setup_p4_watchdog(unsigned nmi_hz) evntsel_msr = MSR_P4_CRU_ESCR0; cccr_msr = MSR_P4_IQ_CCCR0; cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); + + /* + * If we're on the kdump kernel or other situation, we may + * still have other performance counter registers set to + * interrupt and they'll keep interrupting forever because + * of the P4_CCCR_OVF quirk. So we need to ACK all the + * pending interrupts and disable all the registers here, + * before reenabling the NMI delivery. Refer to p4_rearm() + * about the P4_CCCR_OVF quirk. + */ + if (reset_devices) { + unsigned int low, high; + int i; + + for (i = 0; i < P4_CONTROLS; i++) { + rdmsr(p4_controls[i], low, high); + low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF); + wrmsr(p4_controls[i], low, high); + } + } } else { /* logical cpu 1 */ perfctr_msr = MSR_P4_IQ_PERFCTR1; evntsel_msr = MSR_P4_CRU_ESCR0; cccr_msr = MSR_P4_IQ_CCCR1; - cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); + + /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */ + if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4) + cccr_val = P4_CCCR_OVF_PMI0; + else + cccr_val = P4_CCCR_OVF_PMI1; + cccr_val |= P4_CCCR_ESCR_SELECT(4); } evntsel = P4_ESCR_EVENT_SELECT(0x3F) @@ -473,12 +552,17 @@ static int setup_p4_watchdog(unsigned nmi_hz) wrmsr(evntsel_msr, evntsel, 0); wrmsr(cccr_msr, cccr_val, 0); write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); - apic_write(APIC_LVTPC, APIC_DM_NMI); - cccr_val |= P4_CCCR_ENABLE; - wrmsr(cccr_msr, cccr_val, 0); + wd->perfctr_msr = perfctr_msr; wd->evntsel_msr = evntsel_msr; wd->cccr_msr = cccr_msr; + + /* ok, everything is initialized, announce that we're set */ + cpu_nmi_set_wd_enabled(); + + apic_write(APIC_LVTPC, APIC_DM_NMI); + cccr_val |= P4_CCCR_ENABLE; + wrmsr(cccr_msr, cccr_val, 0); return 1; } @@ -540,20 +624,21 @@ static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) } static const struct wd_ops p4_wd_ops = { - .reserve = p4_reserve, - .unreserve = p4_unreserve, - .setup = setup_p4_watchdog, - .rearm = p4_rearm, - .stop = stop_p4_watchdog, + .reserve = p4_reserve, + .unreserve = p4_unreserve, + .setup = setup_p4_watchdog, + .rearm = p4_rearm, + .stop = stop_p4_watchdog, /* RED-PEN this is wrong for the other sibling */ - .perfctr = MSR_P4_BPU_PERFCTR0, - .evntsel = MSR_P4_BSU_ESCR0, - .checkbit = 1ULL<<39, + .perfctr = MSR_P4_BPU_PERFCTR0, + .evntsel = MSR_P4_BSU_ESCR0, + .checkbit = 1ULL << 39, }; -/* Watchdog using the Intel architected PerfMon. Used for Core2 and hopefully - all future Intel CPUs. */ - +/* + * Watchdog using the Intel architected PerfMon. + * Used for Core2 and hopefully all future Intel CPUs. + */ #define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL #define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK @@ -593,25 +678,29 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) wrmsr(evntsel_msr, evntsel, 0); nmi_hz = adjust_for_32bit_ctr(nmi_hz); write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); wd->perfctr_msr = perfctr_msr; wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused + wd->cccr_msr = 0; /* unused */ + + /* ok, everything is initialized, announce that we're set */ + cpu_nmi_set_wd_enabled(); + + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); return 1; } static struct wd_ops intel_arch_wd_ops __read_mostly = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_intel_arch_watchdog, - .rearm = p6_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_ARCH_PERFMON_PERFCTR1, - .evntsel = MSR_ARCH_PERFMON_EVENTSEL1, + .reserve = single_msr_reserve, + .unreserve = single_msr_unreserve, + .setup = setup_intel_arch_watchdog, + .rearm = p6_rearm, + .stop = single_msr_stop_watchdog, + .perfctr = MSR_ARCH_PERFMON_PERFCTR1, + .evntsel = MSR_ARCH_PERFMON_EVENTSEL1, }; static void probe_nmi_watchdog(void) @@ -624,8 +713,10 @@ static void probe_nmi_watchdog(void) wd_ops = &k7_wd_ops; break; case X86_VENDOR_INTEL: - /* Work around Core Duo (Yonah) errata AE49 where perfctr1 - doesn't have a working enable bit. */ + /* + * Work around Core Duo (Yonah) errata AE49 where perfctr1 + * doesn't have a working enable bit. + */ if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) { intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; @@ -636,7 +727,7 @@ static void probe_nmi_watchdog(void) } switch (boot_cpu_data.x86) { case 6: - if (boot_cpu_data.x86_model > 0xd) + if (boot_cpu_data.x86_model > 13) return; wd_ops = &p6_wd_ops; @@ -697,10 +788,11 @@ int lapic_wd_event(unsigned nmi_hz) { struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); u64 ctr; + rdmsrl(wd->perfctr_msr, ctr); - if (ctr & wd_ops->checkbit) { /* perfctr still running? */ + if (ctr & wd_ops->checkbit) /* perfctr still running? */ return 0; - } + wd_ops->rearm(wd, nmi_hz); return 1; } |