Diffstat (limited to 'arch/x86/kernel/cpu/perfctr-watchdog.c')
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 306
1 file changed, 199 insertions(+), 107 deletions(-)
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f9ae93adffe..6bff382094f 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -1,11 +1,15 @@
-/* local apic based NMI watchdog for various CPUs.
- This file also handles reservation of performance counters for coordination
- with other users (like oprofile).
-
- Note that these events normally don't tick when the CPU idles. This means
- the frequency varies with CPU load.
-
- Original code for K7/P6 written by Keith Owens */
+/*
+ * local apic based NMI watchdog for various CPUs.
+ *
+ * This file also handles reservation of performance counters for coordination
+ * with other users (like oprofile).
+ *
+ * Note that these events normally don't tick when the CPU idles. This means
+ * the frequency varies with CPU load.
+ *
+ * Original code for K7/P6 written by Keith Owens
+ *
+ */
#include <linux/percpu.h>
#include <linux/module.h>
@@ -36,12 +40,16 @@ struct wd_ops {
static const struct wd_ops *wd_ops;
-/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
- * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
+/*
+ * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
+ * offset from MSR_P4_BSU_ESCR0.
+ *
+ * It will be the max for all platforms (for now)
*/
#define NMI_MAX_COUNTER_BITS 66
-/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
+/*
+ * perfctr_nmi_owner tracks the ownership of the perfctr registers:
* evtsel_nmi_owner tracks the ownership of the event selection
* - different performance counters/ event selection may be reserved for
* different subsystems this reservation system just tries to coordinate
@@ -73,8 +81,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
return 0;
}
-/* converts an msr to an appropriate reservation bit */
-/* returns the bit offset of the event selection register */
+/*
+ * converts an msr to an appropriate reservation bit
+ * returns the bit offset of the event selection register
+ */
static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
{
/* returns the bit offset of the event selection register */
@@ -114,6 +124,7 @@ int avail_to_resrv_perfctr_nmi(unsigned int msr)
return (!test_bit(counter, perfctr_nmi_owner));
}
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
int reserve_perfctr_nmi(unsigned int msr)
{
@@ -128,6 +139,7 @@ int reserve_perfctr_nmi(unsigned int msr)
return 1;
return 0;
}
+EXPORT_SYMBOL(reserve_perfctr_nmi);
void release_perfctr_nmi(unsigned int msr)
{
@@ -140,6 +152,7 @@ void release_perfctr_nmi(unsigned int msr)
clear_bit(counter, perfctr_nmi_owner);
}
+EXPORT_SYMBOL(release_perfctr_nmi);
int reserve_evntsel_nmi(unsigned int msr)
{
@@ -154,6 +167,7 @@ int reserve_evntsel_nmi(unsigned int msr)
return 1;
return 0;
}
+EXPORT_SYMBOL(reserve_evntsel_nmi);
void release_evntsel_nmi(unsigned int msr)
{
@@ -166,11 +180,6 @@ void release_evntsel_nmi(unsigned int msr)
clear_bit(counter, evntsel_nmi_owner);
}
-
-EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
-EXPORT_SYMBOL(reserve_perfctr_nmi);
-EXPORT_SYMBOL(release_perfctr_nmi);
-EXPORT_SYMBOL(reserve_evntsel_nmi);
EXPORT_SYMBOL(release_evntsel_nmi);
void disable_lapic_nmi_watchdog(void)
@@ -180,8 +189,10 @@ void disable_lapic_nmi_watchdog(void)
if (atomic_read(&nmi_active) <= 0)
return;
- on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
- wd_ops->unreserve();
+ on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
+
+ if (wd_ops)
+ wd_ops->unreserve();
BUG_ON(atomic_read(&nmi_active) != 0);
}
@@ -202,7 +213,7 @@ void enable_lapic_nmi_watchdog(void)
return;
}
- on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
+ on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
touch_nmi_watchdog();
}
@@ -232,31 +243,32 @@ static unsigned int adjust_for_32bit_ctr(unsigned int hz)
return retval;
}
-static void
-write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi_hz)
+static void write_watchdog_counter(unsigned int perfctr_msr,
+ const char *descr, unsigned nmi_hz)
{
u64 count = (u64)cpu_khz * 1000;
do_div(count, nmi_hz);
if(descr)
- Dprintk("setting %s to -0x%08Lx\n", descr, count);
+ pr_debug("setting %s to -0x%08Lx\n", descr, count);
wrmsrl(perfctr_msr, 0 - count);
}
static void write_watchdog_counter32(unsigned int perfctr_msr,
- const char *descr, unsigned nmi_hz)
+ const char *descr, unsigned nmi_hz)
{
u64 count = (u64)cpu_khz * 1000;
do_div(count, nmi_hz);
if(descr)
- Dprintk("setting %s to -0x%08Lx\n", descr, count);
+ pr_debug("setting %s to -0x%08Lx\n", descr, count);
wrmsr(perfctr_msr, (u32)(-count), 0);
}
-/* AMD K7/K8/Family10h/Family11h support. AMD keeps this interface
- nicely stable so there is not much variety */
-
+/*
+ * AMD K7/K8/Family10h/Family11h support.
+ * AMD keeps this interface nicely stable so there is not much variety
+ */
#define K7_EVNTSEL_ENABLE (1 << 22)
#define K7_EVNTSEL_INT (1 << 20)
#define K7_EVNTSEL_OS (1 << 17)
@@ -283,13 +295,19 @@ static int setup_k7_watchdog(unsigned nmi_hz)
/* setup the timer */
wrmsr(evntsel_msr, evntsel, 0);
write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
+
+ /* initialize the wd struct before enabling */
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = 0; /* unused */
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= K7_EVNTSEL_ENABLE;
wrmsr(evntsel_msr, evntsel, 0);
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; //unused
return 1;
}
@@ -325,18 +343,19 @@ static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
}
static const struct wd_ops k7_wd_ops = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_k7_watchdog,
- .rearm = single_msr_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_K7_PERFCTR0,
- .evntsel = MSR_K7_EVNTSEL0,
- .checkbit = 1ULL<<47,
+ .reserve = single_msr_reserve,
+ .unreserve = single_msr_unreserve,
+ .setup = setup_k7_watchdog,
+ .rearm = single_msr_rearm,
+ .stop = single_msr_stop_watchdog,
+ .perfctr = MSR_K7_PERFCTR0,
+ .evntsel = MSR_K7_EVNTSEL0,
+ .checkbit = 1ULL << 47,
};
-/* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */
-
+/*
+ * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
+ */
#define P6_EVNTSEL0_ENABLE (1 << 22)
#define P6_EVNTSEL_INT (1 << 20)
#define P6_EVNTSEL_OS (1 << 17)
@@ -366,58 +385,91 @@ static int setup_p6_watchdog(unsigned nmi_hz)
wrmsr(evntsel_msr, evntsel, 0);
nmi_hz = adjust_for_32bit_ctr(nmi_hz);
write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
+
+ /* initialize the wd struct before enabling */
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = 0; /* unused */
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= P6_EVNTSEL0_ENABLE;
wrmsr(evntsel_msr, evntsel, 0);
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; //unused
return 1;
}
static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
{
- /* P6 based Pentium M need to re-unmask
+ /*
+ * P6 based Pentium M need to re-unmask
* the apic vector but it doesn't hurt
* other P6 variant.
- * ArchPerfom/Core Duo also needs this */
+ * ArchPerfmon/Core Duo also needs this
+ */
apic_write(APIC_LVTPC, APIC_DM_NMI);
+
/* P6/ARCH_PERFMON has 32 bit counter write */
write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz);
}
static const struct wd_ops p6_wd_ops = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_p6_watchdog,
- .rearm = p6_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_P6_PERFCTR0,
- .evntsel = MSR_P6_EVNTSEL0,
- .checkbit = 1ULL<<39,
+ .reserve = single_msr_reserve,
+ .unreserve = single_msr_unreserve,
+ .setup = setup_p6_watchdog,
+ .rearm = p6_rearm,
+ .stop = single_msr_stop_watchdog,
+ .perfctr = MSR_P6_PERFCTR0,
+ .evntsel = MSR_P6_EVNTSEL0,
+ .checkbit = 1ULL << 39,
};
-/* Intel P4 performance counters. By far the most complicated of all. */
-
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
-#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
-#define P4_ESCR_OS (1<<3)
-#define P4_ESCR_USR (1<<2)
-#define P4_CCCR_OVF_PMI0 (1<<26)
-#define P4_CCCR_OVF_PMI1 (1<<27)
-#define P4_CCCR_THRESHOLD(N) ((N)<<20)
-#define P4_CCCR_COMPLEMENT (1<<19)
-#define P4_CCCR_COMPARE (1<<18)
-#define P4_CCCR_REQUIRED (3<<16)
-#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
-#define P4_CCCR_ENABLE (1<<12)
-#define P4_CCCR_OVF (1<<31)
-
-/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
- CRU_ESCR0 (with any non-null event selector) through a complemented
- max threshold. [IA32-Vol3, Section 14.9.9] */
-
+/*
+ * Intel P4 performance counters.
+ * By far the most complicated of all.
+ */
+#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7)
+#define P4_ESCR_EVENT_SELECT(N) ((N) << 25)
+#define P4_ESCR_OS (1 << 3)
+#define P4_ESCR_USR (1 << 2)
+#define P4_CCCR_OVF_PMI0 (1 << 26)
+#define P4_CCCR_OVF_PMI1 (1 << 27)
+#define P4_CCCR_THRESHOLD(N) ((N) << 20)
+#define P4_CCCR_COMPLEMENT (1 << 19)
+#define P4_CCCR_COMPARE (1 << 18)
+#define P4_CCCR_REQUIRED (3 << 16)
+#define P4_CCCR_ESCR_SELECT(N) ((N) << 13)
+#define P4_CCCR_ENABLE (1 << 12)
+#define P4_CCCR_OVF (1 << 31)
+
+#define P4_CONTROLS 18
+static unsigned int p4_controls[18] = {
+ MSR_P4_BPU_CCCR0,
+ MSR_P4_BPU_CCCR1,
+ MSR_P4_BPU_CCCR2,
+ MSR_P4_BPU_CCCR3,
+ MSR_P4_MS_CCCR0,
+ MSR_P4_MS_CCCR1,
+ MSR_P4_MS_CCCR2,
+ MSR_P4_MS_CCCR3,
+ MSR_P4_FLAME_CCCR0,
+ MSR_P4_FLAME_CCCR1,
+ MSR_P4_FLAME_CCCR2,
+ MSR_P4_FLAME_CCCR3,
+ MSR_P4_IQ_CCCR0,
+ MSR_P4_IQ_CCCR1,
+ MSR_P4_IQ_CCCR2,
+ MSR_P4_IQ_CCCR3,
+ MSR_P4_IQ_CCCR4,
+ MSR_P4_IQ_CCCR5,
+};
+/*
+ * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
+ * CRU_ESCR0 (with any non-null event selector) through a complemented
+ * max threshold. [IA32-Vol3, Section 14.9.9]
+ */
static int setup_p4_watchdog(unsigned nmi_hz)
{
unsigned int perfctr_msr, evntsel_msr, cccr_msr;
@@ -442,7 +494,8 @@ static int setup_p4_watchdog(unsigned nmi_hz)
#endif
ht_num = 0;
- /* performance counters are shared resources
+ /*
+ * performance counters are shared resources
* assign each hyperthread its own set
* (re-use the ESCR0 register, seems safe
* and keeps the cccr_val the same)
@@ -453,12 +506,38 @@ static int setup_p4_watchdog(unsigned nmi_hz)
evntsel_msr = MSR_P4_CRU_ESCR0;
cccr_msr = MSR_P4_IQ_CCCR0;
cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
+
+ /*
+ * If we're on the kdump kernel or other situation, we may
+ * still have other performance counter registers set to
+ * interrupt and they'll keep interrupting forever because
+ * of the P4_CCCR_OVF quirk. So we need to ACK all the
+ * pending interrupts and disable all the registers here,
+ * before reenabling the NMI delivery. Refer to p4_rearm()
+ * about the P4_CCCR_OVF quirk.
+ */
+ if (reset_devices) {
+ unsigned int low, high;
+ int i;
+
+ for (i = 0; i < P4_CONTROLS; i++) {
+ rdmsr(p4_controls[i], low, high);
+ low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
+ wrmsr(p4_controls[i], low, high);
+ }
+ }
} else {
/* logical cpu 1 */
perfctr_msr = MSR_P4_IQ_PERFCTR1;
evntsel_msr = MSR_P4_CRU_ESCR0;
cccr_msr = MSR_P4_IQ_CCCR1;
- cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
+
+ /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
+ if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
+ cccr_val = P4_CCCR_OVF_PMI0;
+ else
+ cccr_val = P4_CCCR_OVF_PMI1;
+ cccr_val |= P4_CCCR_ESCR_SELECT(4);
}
evntsel = P4_ESCR_EVENT_SELECT(0x3F)
@@ -473,12 +552,17 @@ static int setup_p4_watchdog(unsigned nmi_hz)
wrmsr(evntsel_msr, evntsel, 0);
wrmsr(cccr_msr, cccr_val, 0);
write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- cccr_val |= P4_CCCR_ENABLE;
- wrmsr(cccr_msr, cccr_val, 0);
+
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
wd->cccr_msr = cccr_msr;
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ cccr_val |= P4_CCCR_ENABLE;
+ wrmsr(cccr_msr, cccr_val, 0);
return 1;
}
@@ -540,20 +624,21 @@ static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
}
static const struct wd_ops p4_wd_ops = {
- .reserve = p4_reserve,
- .unreserve = p4_unreserve,
- .setup = setup_p4_watchdog,
- .rearm = p4_rearm,
- .stop = stop_p4_watchdog,
+ .reserve = p4_reserve,
+ .unreserve = p4_unreserve,
+ .setup = setup_p4_watchdog,
+ .rearm = p4_rearm,
+ .stop = stop_p4_watchdog,
/* RED-PEN this is wrong for the other sibling */
- .perfctr = MSR_P4_BPU_PERFCTR0,
- .evntsel = MSR_P4_BSU_ESCR0,
- .checkbit = 1ULL<<39,
+ .perfctr = MSR_P4_BPU_PERFCTR0,
+ .evntsel = MSR_P4_BSU_ESCR0,
+ .checkbit = 1ULL << 39,
};
-/* Watchdog using the Intel architected PerfMon. Used for Core2 and hopefully
- all future Intel CPUs. */
-
+/*
+ * Watchdog using the Intel architected PerfMon.
+ * Used for Core2 and hopefully all future Intel CPUs.
+ */
#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
@@ -593,25 +678,29 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
wrmsr(evntsel_msr, evntsel, 0);
nmi_hz = adjust_for_32bit_ctr(nmi_hz);
write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; //unused
+ wd->cccr_msr = 0; /* unused */
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsr(evntsel_msr, evntsel, 0);
intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
return 1;
}
static struct wd_ops intel_arch_wd_ops __read_mostly = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_intel_arch_watchdog,
- .rearm = p6_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
- .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
+ .reserve = single_msr_reserve,
+ .unreserve = single_msr_unreserve,
+ .setup = setup_intel_arch_watchdog,
+ .rearm = p6_rearm,
+ .stop = single_msr_stop_watchdog,
+ .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
+ .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
};
static void probe_nmi_watchdog(void)
@@ -624,8 +713,10 @@ static void probe_nmi_watchdog(void)
wd_ops = &k7_wd_ops;
break;
case X86_VENDOR_INTEL:
- /* Work around Core Duo (Yonah) errata AE49 where perfctr1
- doesn't have a working enable bit. */
+ /*
+ * Work around Core Duo (Yonah) errata AE49 where perfctr1
+ * doesn't have a working enable bit.
+ */
if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) {
intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
@@ -636,7 +727,7 @@ static void probe_nmi_watchdog(void)
}
switch (boot_cpu_data.x86) {
case 6:
- if (boot_cpu_data.x86_model > 0xd)
+ if (boot_cpu_data.x86_model > 13)
return;
wd_ops = &p6_wd_ops;
@@ -697,10 +788,11 @@ int lapic_wd_event(unsigned nmi_hz)
{
struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
u64 ctr;
+
rdmsrl(wd->perfctr_msr, ctr);
- if (ctr & wd_ops->checkbit) { /* perfctr still running? */
+ if (ctr & wd_ops->checkbit) /* perfctr still running? */
return 0;
- }
+
wd_ops->rearm(wd, nmi_hz);
return 1;
}
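
As a reference for the counter programming done by write_watchdog_counter() in the hunks above, here is a minimal stand-alone C sketch of the same period math: the performance counter counts up and raises the NMI when it overflows past zero, so it is loaded with the negative of (cpu_khz * 1000) / nmi_hz to fire roughly nmi_hz times per second. The cpu_khz and nmi_hz values below are assumed example values for illustration, not values taken from the patch.

#include <stdint.h>
#include <stdio.h>

/*
 * User-space sketch of the period math in write_watchdog_counter():
 * program the perfctr with the negative of (cpu_khz * 1000) / nmi_hz so
 * that it overflows (and raises the watchdog NMI) about nmi_hz times per
 * second.  cpu_khz and nmi_hz are assumed example values.
 */
int main(void)
{
	uint64_t cpu_khz = 2400000;	/* assumed 2.4 GHz CPU */
	unsigned int nmi_hz = 1;	/* one watchdog tick per second */

	uint64_t count = cpu_khz * 1000;	/* cycles per second */
	count /= nmi_hz;			/* cycles per NMI period */

	/* the MSR is written with the two's complement of the period */
	uint64_t programmed = 0 - count;

	/* mirrors the patch's pr_debug("setting %s to -0x%08Lx\n", ...) */
	printf("period: %llu cycles, programmed as -0x%08llx\n",
	       (unsigned long long)count, (unsigned long long)count);
	printf("raw 64-bit value written: 0x%016llx\n",
	       (unsigned long long)programmed);
	return 0;
}

This is also why the 32-bit variant write_watchdog_counter32() exists: on P6 and architectural-perfmon counters only the low 32 bits are writable, so the same negative period is truncated to a u32 before being written.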