From 1830b52d0de8c60c4f5dfbac134aa8f69d815801 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 7 Feb 2009 19:38:43 -0500 Subject: trace: remove deprecated entry->cpu Impact: fix to prevent developers from using entry->cpu With the new ring buffer infrastructure, the cpu for the entry is implicit with which CPU buffer it is on. The original code use to record the current cpu into the generic entry header, which can be retrieved by entry->cpu. When the ring buffer was introduced, the users were convert to use the the cpu number of which cpu ring buffer was in use (this was passed to the tracers by the iterator: iter->cpu). Unfortunately, the cpu item in the entry structure was never removed. This allowed for developers to use it instead of the proper iter->cpu, unknowingly, using an uninitialized variable. This was not the fault of the developers, since it would seem like the logical place to retrieve the cpu identifier. This patch removes the cpu item from the entry structure and fixes all the users that should have been using iter->cpu. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 1 - kernel/trace/trace_hw_branches.c | 3 +-- kernel/trace/trace_output.c | 6 +++--- 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fd51cf0b94c..bd4d9f8818f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1531,7 +1531,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) if (trace_flags & TRACE_ITER_CONTEXT_INFO) { SEQ_PUT_FIELD_RET(s, entry->pid); - SEQ_PUT_FIELD_RET(s, entry->cpu); + SEQ_PUT_FIELD_RET(s, iter->cpu); SEQ_PUT_FIELD_RET(s, iter->ts); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f0c7a0f08ca..5efc4c707f7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -45,7 +45,6 @@ enum trace_type { */ struct trace_entry { unsigned char type; - unsigned char cpu; unsigned char flags; unsigned char preempt_count; int pid; diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index fff3545fc86..549238a9b13 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -159,7 +159,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) trace_assign_type(it, entry); if (entry->type == TRACE_HW_BRANCHES) { - if (trace_seq_printf(seq, "%4d ", entry->cpu) && + if (trace_seq_printf(seq, "%4d ", iter->cpu) && seq_print_ip_sym(seq, it->to, symflags) && trace_seq_printf(seq, "\t <- ") && seq_print_ip_sym(seq, it->from, symflags) && @@ -195,7 +195,6 @@ void trace_hw_branch(u64 from, u64 to) entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, from); entry->ent.type = TRACE_HW_BRANCHES; - entry->ent.cpu = cpu; entry->from = from; entry->to = to; ring_buffer_unlock_commit(tr->buffer, event, irq2); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b7380eee9fa..463a310b1d3 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -333,7 +333,7 @@ int trace_print_context(struct trace_iterator *iter) unsigned long secs = (unsigned long)t; return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", - comm, entry->pid, entry->cpu, secs, usec_rem); + comm, entry->pid, iter->cpu, secs, usec_rem); } int trace_print_lat_context(struct trace_iterator *iter) @@ -356,7 +356,7 @@ int trace_print_lat_context(struct trace_iterator *iter) char *comm = trace_find_cmdline(entry->pid); ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]" " %ld.%03ldms (+%ld.%03ldms): ", comm, - entry->pid, entry->cpu, entry->flags, + entry->pid, iter->cpu, entry->flags, entry->preempt_count, iter->idx, ns2usecs(iter->ts), abs_usecs / USEC_PER_MSEC, @@ -364,7 +364,7 @@ int trace_print_lat_context(struct trace_iterator *iter) rel_usecs / USEC_PER_MSEC, rel_usecs % USEC_PER_MSEC); } else { - ret = lat_print_generic(s, entry, entry->cpu); + ret = lat_print_generic(s, entry, iter->cpu); if (ret) ret = lat_print_timestamp(s, abs_usecs, rel_usecs); } -- cgit v1.2.3 From 78d904b46a72fcf15ea6a39672bbef92953876b5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Feb 2009 18:43:07 -0500 Subject: ring-buffer: add NMI protection for spinlocks Impact: prevent deadlock in NMI The ring buffers are not yet totally lockless with writing to the buffer. When a writer crosses a page, it grabs a per cpu spinlock to protect against a reader. The spinlocks taken by a writer are not to protect against other writers, since a writer can only write to its own per cpu buffer. The spinlocks protect against readers that can touch any cpu buffer. The writers are made to be reentrant with the spinlocks disabling interrupts. The problem arises when an NMI writes to the buffer, and that write crosses a page boundary. If it grabs a spinlock, it can be racing with another writer (since disabling interrupts does not protect against NMIs) or with a reader on the same CPU. Luckily, most of the users are not reentrant and protects against this issue. But if a user of the ring buffer becomes reentrant (which is what the ring buffers do allow), if the NMI also writes to the ring buffer then we risk the chance of a deadlock. This patch moves the ftrace_nmi_enter called by nmi_enter() to the ring buffer code. It replaces the current ftrace_nmi_enter that is used by arch specific code to arch_ftrace_nmi_enter and updates the Kconfig to handle it. When an NMI is called, it will set a per cpu variable in the ring buffer code and will clear it when the NMI exits. If a write to the ring buffer crosses page boundaries inside an NMI, a trylock is used on the spin lock instead. If the spinlock fails to be acquired, then the entry is discarded. This bug appeared in the ftrace work in the RT tree, where event tracing is reentrant. This workaround solved the deadlocks that appeared there. Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 8 ++++++++ kernel/trace/ring_buffer.c | 48 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 28f2644484d..25131a5d5e4 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT config NOP_TRACER bool +config HAVE_FTRACE_NMI_ENTER + bool + config HAVE_FUNCTION_TRACER bool @@ -37,6 +40,11 @@ config TRACER_MAX_TRACE config RING_BUFFER bool +config FTRACE_NMI_ENTER + bool + depends on HAVE_FTRACE_NMI_ENTER + default y + config TRACING bool select DEBUG_FS diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b36d7374cee..a60a6a852f4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * Copyright (C) 2008 Steven Rostedt */ #include +#include #include #include #include @@ -18,6 +19,35 @@ #include "trace.h" +/* + * Since the write to the buffer is still not fully lockless, + * we must be careful with NMIs. The locks in the writers + * are taken when a write crosses to a new page. The locks + * protect against races with the readers (this will soon + * be fixed with a lockless solution). + * + * Because we can not protect against NMIs, and we want to + * keep traces reentrant, we need to manage what happens + * when we are in an NMI. + */ +static DEFINE_PER_CPU(int, rb_in_nmi); + +void ftrace_nmi_enter(void) +{ + __get_cpu_var(rb_in_nmi)++; + /* call arch specific handler too */ + arch_ftrace_nmi_enter(); +} + +void ftrace_nmi_exit(void) +{ + arch_ftrace_nmi_exit(); + __get_cpu_var(rb_in_nmi)--; + /* NMIs are not recursive */ + WARN_ON_ONCE(__get_cpu_var(rb_in_nmi)); +} + + /* * A fast way to enable or disable all ring buffers is to * call tracing_on or tracing_off. Turning off the ring buffers @@ -982,6 +1012,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; unsigned long flags; + bool lock_taken = false; commit_page = cpu_buffer->commit_page; /* we just need to protect against interrupts */ @@ -995,7 +1026,19 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *next_page = tail_page; local_irq_save(flags); - __raw_spin_lock(&cpu_buffer->lock); + /* + * NMIs can happen after we take the lock. + * If we are in an NMI, only take the lock + * if it is not already taken. Otherwise + * simply fail. + */ + if (unlikely(__get_cpu_var(rb_in_nmi))) { + if (!__raw_spin_trylock(&cpu_buffer->lock)) + goto out_unlock; + } else + __raw_spin_lock(&cpu_buffer->lock); + + lock_taken = true; rb_inc_page(cpu_buffer, &next_page); @@ -1097,7 +1140,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (tail <= BUF_PAGE_SIZE) local_set(&tail_page->write, tail); - __raw_spin_unlock(&cpu_buffer->lock); + if (likely(lock_taken)) + __raw_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); return NULL; } -- cgit v1.2.3 From a81bd80a0b0a405dc0483e2c428332d69da2c79f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 6 Feb 2009 01:45:16 -0500 Subject: ring-buffer: use generic version of in_nmi Impact: clean up Now that a generic in_nmi is available, this patch removes the special code in the ring_buffer and implements the in_nmi generic version instead. With this change, I was also able to rename the "arch_ftrace_nmi_enter" back to "ftrace_nmi_enter" and remove the code from the ring buffer. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 43 +++++++++++++------------------------------ 1 file changed, 13 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a60a6a852f4..5ee344417cd 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -19,35 +20,6 @@ #include "trace.h" -/* - * Since the write to the buffer is still not fully lockless, - * we must be careful with NMIs. The locks in the writers - * are taken when a write crosses to a new page. The locks - * protect against races with the readers (this will soon - * be fixed with a lockless solution). - * - * Because we can not protect against NMIs, and we want to - * keep traces reentrant, we need to manage what happens - * when we are in an NMI. - */ -static DEFINE_PER_CPU(int, rb_in_nmi); - -void ftrace_nmi_enter(void) -{ - __get_cpu_var(rb_in_nmi)++; - /* call arch specific handler too */ - arch_ftrace_nmi_enter(); -} - -void ftrace_nmi_exit(void) -{ - arch_ftrace_nmi_exit(); - __get_cpu_var(rb_in_nmi)--; - /* NMIs are not recursive */ - WARN_ON_ONCE(__get_cpu_var(rb_in_nmi)); -} - - /* * A fast way to enable or disable all ring buffers is to * call tracing_on or tracing_off. Turning off the ring buffers @@ -1027,12 +999,23 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, local_irq_save(flags); /* + * Since the write to the buffer is still not + * fully lockless, we must be careful with NMIs. + * The locks in the writers are taken when a write + * crosses to a new page. The locks protect against + * races with the readers (this will soon be fixed + * with a lockless solution). + * + * Because we can not protect against NMIs, and we + * want to keep traces reentrant, we need to manage + * what happens when we are in an NMI. + * * NMIs can happen after we take the lock. * If we are in an NMI, only take the lock * if it is not already taken. Otherwise * simply fail. */ - if (unlikely(__get_cpu_var(rb_in_nmi))) { + if (unlikely(in_nmi())) { if (!__raw_spin_trylock(&cpu_buffer->lock)) goto out_unlock; } else -- cgit v1.2.3 From 57794a9d48b63e34acbe63282628c9f029603308 Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Fri, 6 Feb 2009 17:33:27 +0800 Subject: trace: trivial fixes in comment typos. Impact: clean up Fixed several typos in the comments. Signed-off-by: Wenji Huang Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 +++--- kernel/trace/trace.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 68610031780..1796e018fbf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -465,7 +465,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) * it is not enabled then do nothing. * * If this record is not to be traced and - * it is enabled then disabled it. + * it is enabled then disable it. * */ if (rec->flags & FTRACE_FL_NOTRACE) { @@ -485,7 +485,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) return 0; - /* Record is not filtered and is not enabled do nothing */ + /* Record is not filtered or enabled, do nothing */ if (!fl) return 0; @@ -507,7 +507,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) } else { - /* if record is not enabled do nothing */ + /* if record is not enabled, do nothing */ if (!(rec->flags & FTRACE_FL_ENABLED)) return 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5efc4c707f7..f92aba52a89 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -616,12 +616,12 @@ extern struct tracer nop_trace; * preempt_enable (after a disable), a schedule might take place * causing an infinite recursion. * - * To prevent this, we read the need_recshed flag before + * To prevent this, we read the need_resched flag before * disabling preemption. When we want to enable preemption we * check the flag, if it is set, then we call preempt_enable_no_resched. * Otherwise, we call preempt_enable. * - * The rational for doing the above is that if need resched is set + * The rational for doing the above is that if need_resched is set * and we have yet to reschedule, we are either in an atomic location * (where we do not need to check for scheduling) or we are inside * the scheduler and do not want to resched. @@ -642,7 +642,7 @@ static inline int ftrace_preempt_disable(void) * * This is a scheduler safe way to enable preemption and not miss * any preemption checks. The disabled saved the state of preemption. - * If resched is set, then we were either inside an atomic or + * If resched is set, then we are either inside an atomic or * are inside the scheduler (we would have already scheduled * otherwise). In this case, we do not want to call normal * preempt_enable, but preempt_enable_no_resched instead. -- cgit v1.2.3