From b4b8f87bf4958cbad620654efc0882ac46c19846 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:08 +0200 Subject: i386, dumpstack: move crash_kexec before bust_spinlocks(0) in oops_end crash_kexec should not be called with console_sem held. Move the call before bust_spinlocks(0) in oops_end to avoid the problem. Signed-off-by: Alexander van Heukelum Acked-by: "Neil Horman" Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index b3614752197..5493d31be4e 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -309,6 +309,9 @@ unsigned __kprobes long oops_begin(void) void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) { + if (regs && kexec_should_crash(current)) + crash_kexec(regs); + bust_spinlocks(0); die_owner = -1; add_taint(TAINT_DIE); @@ -318,8 +321,6 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) if (!regs) return; - if (kexec_should_crash(current)) - crash_kexec(regs); if (in_interrupt()) panic("Fatal exception in interrupt"); if (panic_on_oops) -- cgit v1.2.3 From 874d93d11823b2b861addac6a5dc31162e924ab2 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:09 +0200 Subject: x86, dumpstack: let signr=0 signal no do_exit Change oops_end such that signr=0 signals that do_exit is not to be called. Currently, each use of __die is soon followed by a call to oops_end and 'regs' is set to NULL if oops_end is expected not to call do_exit. Change all such pairs to set signr=0 instead. On x86_64 oops_end is used 'bare' in die_nmi; use signr=0 instead of regs=NULL there, too. Signed-off-by: Alexander van Heukelum Acked-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 7 ++++--- arch/x86/kernel/dumpstack_64.c | 9 +++++---- arch/x86/mm/fault.c | 11 +++++++---- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 5493d31be4e..7c22f99f0ef 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -318,7 +318,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) __raw_spin_unlock(&die_lock); raw_local_irq_restore(flags); - if (!regs) + if (!signr) return; if (in_interrupt()) @@ -371,17 +371,18 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) void die(const char *str, struct pt_regs *regs, long err) { unsigned long flags = oops_begin(); + int sig = SIGSEGV; if (die_nest_count < 3) { report_bug(regs->ip, regs); if (__die(str, regs, err)) - regs = NULL; + sig = 0; } else { printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); } - oops_end(flags, regs, SIGSEGV); + oops_end(flags, regs, sig); } static DEFINE_SPINLOCK(nmi_print_lock); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 96a5db7da8a..ffefea611ba 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -465,7 +465,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) /* Nest count reaches zero, release the lock. */ __raw_spin_unlock(&die_lock); raw_local_irq_restore(flags); - if (!regs) { + if (!signr) { oops_exit(); return; } @@ -509,13 +509,14 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) void die(const char *str, struct pt_regs *regs, long err) { unsigned long flags = oops_begin(); + int sig = SIGSEGV; if (!user_mode(regs)) report_bug(regs->ip, regs); if (__die(str, regs, err)) - regs = NULL; - oops_end(flags, regs, SIGSEGV); + sig = 0; + oops_end(flags, regs, sig); } notrace __kprobes void @@ -539,7 +540,7 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) crash_kexec(regs); if (do_panic || panic_on_oops) panic("Non maskable interrupt"); - oops_end(flags, NULL, SIGBUS); + oops_end(flags, regs, 0); nmi_exit(); local_irq_enable(); do_exit(SIGBUS); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 31e8730fa24..20ef272c412 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -413,6 +413,7 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, unsigned long error_code) { unsigned long flags = oops_begin(); + int sig = SIGKILL; struct task_struct *tsk; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", @@ -423,8 +424,8 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; if (__die("Bad pagetable", regs, error_code)) - regs = NULL; - oops_end(flags, regs, SIGKILL); + sig = 0; + oops_end(flags, regs, sig); } #endif @@ -590,6 +591,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) int fault; #ifdef CONFIG_X86_64 unsigned long flags; + int sig; #endif tsk = current; @@ -849,11 +851,12 @@ no_context: bust_spinlocks(0); do_exit(SIGKILL); #else + sig = SIGKILL; if (__die("Oops", regs, error_code)) - regs = NULL; + sig = 0; /* Executive summary in case the body of the oops scrolled away */ printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags, regs, SIGKILL); + oops_end(flags, regs, sig); #endif /* -- cgit v1.2.3 From 0ed7a498f416dcfa1cca478a559238a2a3396240 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:10 +0200 Subject: x86_64, dumpstack: move kexec_crash from __die to oops_end oops_end is preceded by either a call to __die, or a conditional call to crash_kexec. Move the conditional call to crash_kexec from the end of __die to the start of oops_end and remove the superfluous call to crash_kexec in die_nmi. Signed-off-by: Alexander van Heukelum Acked-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_64.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index ffefea611ba..57ce11b895c 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -458,6 +458,9 @@ unsigned __kprobes long oops_begin(void) void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) { + if (regs && kexec_should_crash(current)) + crash_kexec(regs); + die_owner = -1; bust_spinlocks(0); die_nest_count--; @@ -501,8 +504,6 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) printk(KERN_ALERT "RIP "); printk_address(regs->ip, 1); printk(" RSP <%016lx>\n", regs->sp); - if (kexec_should_crash(current)) - crash_kexec(regs); return 0; } @@ -536,11 +537,9 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) printk(" on CPU%d, ip %08lx, registers:\n", smp_processor_id(), regs->ip); show_registers(regs); - if (kexec_should_crash(current)) - crash_kexec(regs); + oops_end(flags, regs, 0); if (do_panic || panic_on_oops) panic("Non maskable interrupt"); - oops_end(flags, regs, 0); nmi_exit(); local_irq_enable(); do_exit(SIGBUS); -- cgit v1.2.3 From 10b14cb7eb7dd5bff8023f76a55c8ac20e586128 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:11 +0200 Subject: x86, dumpstack: always call oops_exit from oops_end Always call oops_exit from oops_end, even if signr==0. Also, move add_taint(TAINT_DIE) from __die to oops_end on x86_64 and interchange two lines to make oops_end more similar to the i386-version. Signed-off-by: Alexander van Heukelum Acked-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 2 +- arch/x86/kernel/dumpstack_64.c | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 7c22f99f0ef..a29b88ffa34 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -318,6 +318,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) __raw_spin_unlock(&die_lock); raw_local_irq_restore(flags); + oops_exit(); if (!signr) return; @@ -325,7 +326,6 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) panic("Fatal exception in interrupt"); if (panic_on_oops) panic("Fatal exception"); - oops_exit(); do_exit(signr); } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 57ce11b895c..dc6162bf745 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -461,22 +461,22 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) if (regs && kexec_should_crash(current)) crash_kexec(regs); - die_owner = -1; bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE); die_nest_count--; if (!die_nest_count) /* Nest count reaches zero, release the lock. */ __raw_spin_unlock(&die_lock); raw_local_irq_restore(flags); - if (!signr) { - oops_exit(); + oops_exit(); + + if (!signr) return; - } if (in_interrupt()) panic("Fatal exception in interrupt"); if (panic_on_oops) panic("Fatal exception"); - oops_exit(); do_exit(signr); } @@ -499,7 +499,6 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) return 1; show_registers(regs); - add_taint(TAINT_DIE); /* Executive summary in case the oops scrolled away */ printk(KERN_ALERT "RIP "); printk_address(regs->ip, 1); -- cgit v1.2.3 From e4955cfd2f5c81eb708f55769aa60173f207fd63 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:12 +0200 Subject: i386, dumpstack: use x86_64's method to account die_nest_count oops_begin/oops_end should always be used in pairs. On x86_64 oops_begin increments die_nest_count, and oops_end decrements die_nest_count. Doing this makes oops_begin and oops_end equal to the x86_64 versions. Signed-off-by: Alexander van Heukelum Acked-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index a29b88ffa34..7c7d691b32b 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -289,21 +289,24 @@ static unsigned int die_nest_count; unsigned __kprobes long oops_begin(void) { + int cpu; unsigned long flags; oops_enter(); - if (die_owner != raw_smp_processor_id()) { - console_verbose(); - raw_local_irq_save(flags); - __raw_spin_lock(&die_lock); - die_owner = smp_processor_id(); - die_nest_count = 0; - bust_spinlocks(1); - } else { - raw_local_irq_save(flags); + /* racy, but better than risking deadlock. */ + raw_local_irq_save(flags); + cpu = smp_processor_id(); + if (!__raw_spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. should stop eventually */; + else + __raw_spin_lock(&die_lock); } die_nest_count++; + die_owner = cpu; + console_verbose(); + bust_spinlocks(1); return flags; } @@ -315,13 +318,15 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) bust_spinlocks(0); die_owner = -1; add_taint(TAINT_DIE); - __raw_spin_unlock(&die_lock); + die_nest_count--; + if (!die_nest_count) + /* Nest count reaches zero, release the lock. */ + __raw_spin_unlock(&die_lock); raw_local_irq_restore(flags); - oops_exit(); + if (!signr) return; - if (in_interrupt()) panic("Fatal exception in interrupt"); if (panic_on_oops) -- cgit v1.2.3 From e06ca430c3d0fddbd1c901ab3fb3e1f0bc8a786b Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:13 +0200 Subject: i386, dumpstack: use oops_begin/oops_end in die_nmi Use oops_begin and oops_end in die_nmi. Whitespace-only changes on x86_64, to make it equal to i386's version. Signed-off-by: Alexander van Heukelum Acked-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 33 +++++++++++---------------------- arch/x86/kernel/dumpstack_64.c | 4 ++-- 2 files changed, 13 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 7c7d691b32b..e91ae34f968 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -390,40 +390,29 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -static DEFINE_SPINLOCK(nmi_print_lock); - void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) { + unsigned long flags; + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) return; - spin_lock(&nmi_print_lock); /* - * We are in trouble anyway, lets at least try - * to get a message out: - */ - bust_spinlocks(1); + * We are in trouble anyway, lets at least try + * to get a message out. + */ + flags = oops_begin(); printk(KERN_EMERG "%s", str); printk(" on CPU%d, ip %08lx, registers:\n", smp_processor_id(), regs->ip); show_registers(regs); - if (do_panic) + oops_end(flags, regs, 0); + if (do_panic || panic_on_oops) panic("Non maskable interrupt"); - console_silent(); - spin_unlock(&nmi_print_lock); - - /* - * If we are in kernel we are probably nested up pretty bad - * and might aswell get out now while we still can: - */ - if (!user_mode_vm(regs)) { - current->thread.trap_no = 2; - crash_kexec(regs); - } - - bust_spinlocks(0); - do_exit(SIGSEGV); + nmi_exit(); + local_irq_enable(); + do_exit(SIGBUS); } static int __init oops_setup(char *s) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index dc6162bf745..831e1e159cb 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -519,7 +519,7 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -notrace __kprobes void +void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) { unsigned long flags; @@ -527,11 +527,11 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) return; - flags = oops_begin(); /* * We are in trouble anyway, lets at least try * to get a message out. */ + flags = oops_begin(); printk(KERN_EMERG "%s", str); printk(" on CPU%d, ip %08lx, registers:\n", smp_processor_id(), regs->ip); -- cgit v1.2.3 From 871d3779cba18b028e34d0d2f6cc6caae76a97b6 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:14 +0200 Subject: i386, dumpstack: unify die() Make i386's die() equal to x86_64's version. Whitespace-only changes on x86_64, to make it equal to i386's version. (user_mode and user_mode_vm are equal on x86_64.) Signed-off-by: Alexander van Heukelum Acked-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 10 +++------- arch/x86/kernel/dumpstack_64.c | 6 +++++- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index e91ae34f968..f2046c5752d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -378,15 +378,11 @@ void die(const char *str, struct pt_regs *regs, long err) unsigned long flags = oops_begin(); int sig = SIGSEGV; - if (die_nest_count < 3) { + if (!user_mode_vm(regs)) report_bug(regs->ip, regs); - if (__die(str, regs, err)) - sig = 0; - } else { - printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); - } - + if (__die(str, regs, err)) + sig = 0; oops_end(flags, regs, sig); } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 831e1e159cb..28c67aae556 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -506,12 +506,16 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) return 0; } +/* + * This is gone through when something in the kernel has done something bad + * and is about to be terminated: + */ void die(const char *str, struct pt_regs *regs, long err) { unsigned long flags = oops_begin(); int sig = SIGSEGV; - if (!user_mode(regs)) + if (!user_mode_vm(regs)) report_bug(regs->ip, regs); if (__die(str, regs, err)) -- cgit v1.2.3 From 878719e831d9e076961aa15d4049a57a6668c67a Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 23 Oct 2008 10:40:06 -0400 Subject: x86: unify appropriate bits from dumpstack_32 and dumpstack_64 Impact: cleanup As promised, now that dumpstack_32 and dumpstack_64 have so many bits in common, we should merge the in-sync bits into a common file, to prevent them from diverging again. This patch removes bits which are common between dumpstack_32.c and dumpstack_64.c and places them in a common dumpstack.c which is built for both 32 and 64 bit arches. Signed-off-by: Neil Horman Acked-by: Alexander van Heukelum Signed-off-by: Ingo Molnar Makefile | 2 arch/x86/kernel/Makefile | 2 arch/x86/kernel/Makefile | 2 arch/x86/kernel/Makefile | 2 arch/x86/kernel/Makefile | 2 arch/x86/kernel/Makefile | 2 arch/x86/kernel/dumpstack.c | 319 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/dumpstack.h | 39 +++++ arch/x86/kernel/dumpstack_32.c | 294 ------------------------------------- arch/x86/kernel/dumpstack_64.c | 285 ------------------------------------ 5 files changed, 363 insertions(+), 576 deletions(-) --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/dumpstack.c | 319 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/dumpstack.h | 39 +++++ arch/x86/kernel/dumpstack_32.c | 294 +------------------------------------ arch/x86/kernel/dumpstack_64.c | 285 +----------------------------------- 5 files changed, 363 insertions(+), 576 deletions(-) create mode 100644 arch/x86/kernel/dumpstack.c create mode 100644 arch/x86/kernel/dumpstack.h diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d7e5a58ee22..db3216a9d2b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -24,7 +24,7 @@ CFLAGS_tsc.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -obj-y += time_$(BITS).o ioport.o ldt.o +obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c new file mode 100644 index 00000000000..5962176dfab --- /dev/null +++ b/arch/x86/kernel/dumpstack.c @@ -0,0 +1,319 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dumpstack.h" + +int panic_on_unrecovered_nmi; +unsigned int code_bytes = 64; +int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; +static int die_counter; + +void printk_address(unsigned long address, int reliable) +{ + printk(" [<%p>] %s%pS\n", (void *) address, + reliable ? "" : "? ", (void *) address); +} + +/* + * x86-64 can have up to three kernel stacks: + * process stack + * interrupt stack + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack + */ + +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size, void *end) +{ + void *t = tinfo; + if (end) { + if (p < end && p >= (end-THREAD_SIZE)) + return 1; + else + return 0; + } + return p > t && p < t + THREAD_SIZE - size; +} + +unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { + unsigned long addr; + + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + sizeof(long)) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + } + stack++; + } + return bp; +} + + +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + printk(data); + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s%s\n", (char *)data, msg); +} + +static int print_trace_stack(void *data, char *name) +{ + printk("%s <%s> ", (char *)data, name); + return 0; +} + +/* + * Print one address/symbol entries per line. + */ +static void print_trace_address(void *data, unsigned long addr, int reliable) +{ + touch_nmi_watchdog(); + printk(data); + printk_address(addr, reliable); +} + +static const struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl) +{ + printk("%sCall Trace:\n", log_lvl); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); +} + +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) +{ + show_trace_log_lvl(task, regs, stack, bp, ""); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_log_lvl(task, NULL, sp, 0, ""); +} + +/* + * The architecture-independent dump_stack generator + */ +void dump_stack(void) +{ + unsigned long bp = 0; + unsigned long stack; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + get_bp(bp); +#endif + + printk("Pid: %d, comm: %.20s %s %s %.*s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + show_trace(NULL, NULL, &stack, bp); +} +EXPORT_SYMBOL(dump_stack); + +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned __kprobes long oops_begin(void) +{ + int cpu; + unsigned long flags; + + oops_enter(); + + /* racy, but better than risking deadlock. */ + raw_local_irq_save(flags); + cpu = smp_processor_id(); + if (!__raw_spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. should stop eventually */; + else + __raw_spin_lock(&die_lock); + } + die_nest_count++; + die_owner = cpu; + console_verbose(); + bust_spinlocks(1); + return flags; +} + +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + if (regs && kexec_should_crash(current)) + crash_kexec(regs); + + bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE); + die_nest_count--; + if (!die_nest_count) + /* Nest count reaches zero, release the lock. */ + __raw_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + oops_exit(); + + if (!signr) + return; + if (in_interrupt()) + panic("Fatal exception in interrupt"); + if (panic_on_oops) + panic("Fatal exception"); + do_exit(signr); +} + +int __kprobes __die(const char *str, struct pt_regs *regs, long err) +{ +#ifdef CONFIG_X86_32 + unsigned short ss; + unsigned long sp; +#endif + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + sysfs_printk_last_file(); + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; + + show_registers(regs); +#ifdef CONFIG_X86_32 + sp = (unsigned long) (®s->sp); + savesegment(ss, ss); + if (user_mode(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; + } + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); + print_symbol("%s", regs->ip); + printk(" SS:ESP %04x:%08lx\n", ss, sp); +#else + /* Executive summary in case the oops scrolled away */ + printk(KERN_ALERT "RIP "); + printk_address(regs->ip, 1); + printk(" RSP <%016lx>\n", regs->sp); +#endif + return 0; +} + +/* + * This is gone through when something in the kernel has done something bad + * and is about to be terminated: + */ +void die(const char *str, struct pt_regs *regs, long err) +{ + unsigned long flags = oops_begin(); + int sig = SIGSEGV; + + if (!user_mode_vm(regs)) + report_bug(regs->ip, regs); + + if (__die(str, regs, err)) + sig = 0; + oops_end(flags, regs, sig); +} + +void notrace __kprobes +die_nmi(char *str, struct pt_regs *regs, int do_panic) +{ + unsigned long flags; + + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) + return; + + /* + * We are in trouble anyway, lets at least try + * to get a message out. + */ + flags = oops_begin(); + printk(KERN_EMERG "%s", str); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); + show_registers(regs); + oops_end(flags, regs, 0); + if (do_panic || panic_on_oops) + panic("Non maskable interrupt"); + nmi_exit(); + local_irq_enable(); + do_exit(SIGBUS); +} + +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); + +static int __init kstack_setup(char *s) +{ + if (!s) + return -EINVAL; + kstack_depth_to_print = simple_strtoul(s, NULL, 0); + return 0; +} +early_param("kstack", kstack_setup); + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h new file mode 100644 index 00000000000..3119a801c32 --- /dev/null +++ b/arch/x86/kernel/dumpstack.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ + +#ifndef DUMPSTACK_H +#define DUMPSTACK_H + +#ifdef CONFIG_X86_32 +#define STACKSLOTS_PER_LINE 8 +#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) +#else +#define STACKSLOTS_PER_LINE 4 +#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) +#endif + +extern unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end); + +extern void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl); + +extern void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl); + +extern unsigned int code_bytes; +extern int kstack_depth_to_print; + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; +#endif diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index f2046c5752d..7b031b106ec 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -17,64 +17,7 @@ #include -#define STACKSLOTS_PER_LINE 8 -#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) - -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; -static unsigned int code_bytes = 64; -static int die_counter; - -void printk_address(unsigned long address, int reliable) -{ - printk(" [<%p>] %s%pS\n", (void *) address, - reliable ? "" : "? ", (void *) address); -} - -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size, void *end) -{ - void *t = tinfo; - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p > t && p < t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + sizeof(long)) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} +#include "dumpstack.h" void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, @@ -119,57 +62,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, } EXPORT_SYMBOL(dump_trace); -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - printk(data); - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s%s\n", (char *)data, msg); -} - -static int print_trace_stack(void *data, char *name) -{ - printk("%s <%s> ", (char *)data, name); - return 0; -} - -/* - * Print one address/symbol entries per line. - */ -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - touch_nmi_watchdog(); - printk(data); - printk_address(addr, reliable); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; - -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); -} - -static void +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, unsigned long bp, char *log_lvl) { @@ -196,33 +89,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_log_lvl(task, NULL, sp, 0, ""); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - get_bp(bp); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); -} - -EXPORT_SYMBOL(dump_stack); void show_registers(struct pt_regs *regs) { @@ -283,159 +149,3 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - int cpu; - unsigned long flags; - - oops_enter(); - - /* racy, but better than risking deadlock. */ - raw_local_irq_save(flags); - cpu = smp_processor_id(); - if (!__raw_spin_trylock(&die_lock)) { - if (cpu == die_owner) - /* nested oops. should stop eventually */; - else - __raw_spin_lock(&die_lock); - } - die_nest_count++; - die_owner = cpu; - console_verbose(); - bust_spinlocks(1); - return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ - if (regs && kexec_should_crash(current)) - crash_kexec(regs); - - bust_spinlocks(0); - die_owner = -1; - add_taint(TAINT_DIE); - die_nest_count--; - if (!die_nest_count) - /* Nest count reaches zero, release the lock. */ - __raw_spin_unlock(&die_lock); - raw_local_irq_restore(flags); - oops_exit(); - - if (!signr) - return; - if (in_interrupt()) - panic("Fatal exception in interrupt"); - if (panic_on_oops) - panic("Fatal exception"); - do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - unsigned short ss; - unsigned long sp; - - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - sysfs_printk_last_file(); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) - return 1; - - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - sp = (unsigned long) (®s->sp); - savesegment(ss, ss); - if (user_mode(regs)) { - sp = regs->sp; - ss = regs->ss & 0xffff; - } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); - print_symbol("%s", regs->ip); - printk(" SS:ESP %04x:%08lx\n", ss, sp); - return 0; -} - -/* - * This is gone through when something in the kernel has done something bad - * and is about to be terminated: - */ -void die(const char *str, struct pt_regs *regs, long err) -{ - unsigned long flags = oops_begin(); - int sig = SIGSEGV; - - if (!user_mode_vm(regs)) - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - sig = 0; - oops_end(flags, regs, sig); -} - -void notrace __kprobes -die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - unsigned long flags; - - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - /* - * We are in trouble anyway, lets at least try - * to get a message out. - */ - flags = oops_begin(); - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - oops_end(flags, regs, 0); - if (do_panic || panic_on_oops) - panic("Non maskable interrupt"); - nmi_exit(); - local_irq_enable(); - do_exit(SIGBUS); -} - -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - -static int __init kstack_setup(char *s) -{ - if (!s) - return -EINVAL; - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - return 0; -} -early_param("kstack", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 28c67aae556..33ff10287a5 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -17,19 +17,7 @@ #include -#define STACKSLOTS_PER_LINE 4 -#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) - -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; -static unsigned int code_bytes = 64; -static int die_counter; - -void printk_address(unsigned long address, int reliable) -{ - printk(" [<%p>] %s%pS\n", (void *) address, - reliable ? "" : "? ", (void *) address); -} +#include "dumpstack.h" static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, unsigned *usedp, char **idp) @@ -113,51 +101,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size, void *end) -{ - void *t = tinfo; - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p > t && p < t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + sizeof(long)) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} - void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) @@ -248,57 +191,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, } EXPORT_SYMBOL(dump_trace); -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - printk(data); - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s%s\n", (char *)data, msg); -} - -static int print_trace_stack(void *data, char *name) -{ - printk("%s <%s> ", (char *)data, name); - return 0; -} - -/* - * Print one address/symbol entries per line. - */ -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - touch_nmi_watchdog(); - printk(data); - printk_address(addr, reliable); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; - -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); -} - -static void +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, unsigned long bp, char *log_lvl) { @@ -342,33 +235,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_log_lvl(task, NULL, sp, 0, ""); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - get_bp(bp); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); -} -EXPORT_SYMBOL(dump_stack); - void show_registers(struct pt_regs *regs) { int i; @@ -429,150 +295,3 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - int cpu; - unsigned long flags; - - oops_enter(); - - /* racy, but better than risking deadlock. */ - raw_local_irq_save(flags); - cpu = smp_processor_id(); - if (!__raw_spin_trylock(&die_lock)) { - if (cpu == die_owner) - /* nested oops. should stop eventually */; - else - __raw_spin_lock(&die_lock); - } - die_nest_count++; - die_owner = cpu; - console_verbose(); - bust_spinlocks(1); - return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ - if (regs && kexec_should_crash(current)) - crash_kexec(regs); - - bust_spinlocks(0); - die_owner = -1; - add_taint(TAINT_DIE); - die_nest_count--; - if (!die_nest_count) - /* Nest count reaches zero, release the lock. */ - __raw_spin_unlock(&die_lock); - raw_local_irq_restore(flags); - oops_exit(); - - if (!signr) - return; - if (in_interrupt()) - panic("Fatal exception in interrupt"); - if (panic_on_oops) - panic("Fatal exception"); - do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - sysfs_printk_last_file(); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) - return 1; - - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP "); - printk_address(regs->ip, 1); - printk(" RSP <%016lx>\n", regs->sp); - return 0; -} - -/* - * This is gone through when something in the kernel has done something bad - * and is about to be terminated: - */ -void die(const char *str, struct pt_regs *regs, long err) -{ - unsigned long flags = oops_begin(); - int sig = SIGSEGV; - - if (!user_mode_vm(regs)) - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - sig = 0; - oops_end(flags, regs, sig); -} - -void notrace __kprobes -die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - unsigned long flags; - - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - /* - * We are in trouble anyway, lets at least try - * to get a message out. - */ - flags = oops_begin(); - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - oops_end(flags, regs, 0); - if (do_panic || panic_on_oops) - panic("Non maskable interrupt"); - nmi_exit(); - local_irq_enable(); - do_exit(SIGBUS); -} - -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - -static int __init kstack_setup(char *s) -{ - if (!s) - return -EINVAL; - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - return 0; -} -early_param("kstack", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); -- cgit v1.2.3 From a5e25883a445dce94a087ca479b21a5959cd5c18 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 15:34:05 -0500 Subject: ftrace: replace raw_local_irq_save with local_irq_save Impact: fix for lockdep and ftrace The raw_local_irq_save/restore confuses lockdep. This patch converts them to the local_irq_save/restore variants. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 1 + kernel/trace/trace.c | 12 ++++++------ kernel/trace/trace_branch.c | 4 ++-- kernel/trace/trace_stack.c | 8 ++++---- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 46a404173db..74b1878b8bb 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -25,6 +25,7 @@ * Thanks to Arjan van de Ven for coming up with the initial idea of * mapping lock dependencies runtime. */ +#define DISABLE_BRANCH_PROFILING #include #include #include diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 91887a280ab..380de630ebc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1209,7 +1209,7 @@ void trace_graph_entry(struct ftrace_graph_ent *trace) int cpu; int pc; - raw_local_irq_save(flags); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); @@ -1218,7 +1218,7 @@ void trace_graph_entry(struct ftrace_graph_ent *trace) __trace_graph_entry(tr, data, trace, flags, pc); } atomic_dec(&data->disabled); - raw_local_irq_restore(flags); + local_irq_restore(flags); } void trace_graph_return(struct ftrace_graph_ret *trace) @@ -1230,7 +1230,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace) int cpu; int pc; - raw_local_irq_save(flags); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); @@ -1239,7 +1239,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace) __trace_graph_return(tr, data, trace, flags, pc); } atomic_dec(&data->disabled); - raw_local_irq_restore(flags); + local_irq_restore(flags); } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -2645,7 +2645,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, if (err) goto err_unlock; - raw_local_irq_disable(); + local_irq_disable(); __raw_spin_lock(&ftrace_max_lock); for_each_tracing_cpu(cpu) { /* @@ -2662,7 +2662,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, } } __raw_spin_unlock(&ftrace_max_lock); - raw_local_irq_enable(); + local_irq_enable(); tracing_cpumask = tracing_cpumask_new; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index bc972753568..6c00feb3bac 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -42,7 +42,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) if (unlikely(!tr)) return; - raw_local_irq_save(flags); + local_irq_save(flags); cpu = raw_smp_processor_id(); if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; @@ -74,7 +74,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) out: atomic_dec(&tr->data[cpu]->disabled); - raw_local_irq_restore(flags); + local_irq_restore(flags); } static inline diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index fde3be15c64..06a16115be0 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -48,7 +48,7 @@ static inline void check_stack(void) if (!object_is_on_stack(&this_size)) return; - raw_local_irq_save(flags); + local_irq_save(flags); __raw_spin_lock(&max_stack_lock); /* a race could have already updated it */ @@ -96,7 +96,7 @@ static inline void check_stack(void) out: __raw_spin_unlock(&max_stack_lock); - raw_local_irq_restore(flags); + local_irq_restore(flags); } static void @@ -162,11 +162,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - raw_local_irq_save(flags); + local_irq_save(flags); __raw_spin_lock(&max_stack_lock); *ptr = val; __raw_spin_unlock(&max_stack_lock); - raw_local_irq_restore(flags); + local_irq_restore(flags); return count; } -- cgit v1.2.3 From abc9b56d66fbd4d93302ef4bf6fa726e1b8255f9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 15:34:06 -0500 Subject: ring-buffer: move some metadata into buffer page Impact: get ready for splice changes This patch moves the commit and timestamp into the beginning of each data page of the buffer. This change will allow the page to be moved to another location (disk, network, etc) and still have information in the page to be able to read it. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 63 ++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e206951603c..8619c534588 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -195,20 +195,24 @@ void *ring_buffer_event_data(struct ring_buffer_event *event) #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) -/* - * This hack stolen from mm/slob.c. - * We can store per page timing information in the page frame of the page. - * Thanks to Peter Zijlstra for suggesting this idea. - */ -struct buffer_page { +struct buffer_data_page { u64 time_stamp; /* page time stamp */ - local_t write; /* index for next write */ local_t commit; /* write commited index */ + unsigned char data[]; /* data of buffer page */ +}; + +struct buffer_page { + local_t write; /* index for next write */ unsigned read; /* index for next read */ struct list_head list; /* list of free pages */ - void *page; /* Actual data page */ + struct buffer_data_page *page; /* Actual data page */ }; +static void rb_init_page(struct buffer_data_page *page) +{ + local_set(&page->commit, 0); +} + /* * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing * this issue out. @@ -230,7 +234,7 @@ static inline int test_time_stamp(u64 delta) return 0; } -#define BUF_PAGE_SIZE PAGE_SIZE +#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page)) /* * head_page == tail_page && head == tail then buffer is empty. @@ -333,6 +337,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, if (!addr) goto free_pages; page->page = (void *)addr; + rb_init_page(page->page); } list_splice(&pages, head); @@ -378,6 +383,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) if (!addr) goto fail_free_reader; page->page = (void *)addr; + rb_init_page(page->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); @@ -647,6 +653,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) if (!addr) goto free_pages; page->page = (void *)addr; + rb_init_page(page->page); } } @@ -682,7 +689,7 @@ static inline int rb_null_event(struct ring_buffer_event *event) static inline void *__rb_page_index(struct buffer_page *page, unsigned index) { - return page->page + index; + return page->page->data + index; } static inline struct ring_buffer_event * @@ -712,7 +719,7 @@ static inline unsigned rb_page_write(struct buffer_page *bpage) static inline unsigned rb_page_commit(struct buffer_page *bpage) { - return local_read(&bpage->commit); + return local_read(&bpage->page->commit); } /* Size is determined by what has been commited */ @@ -804,14 +811,15 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, cpu_buffer->commit_page == cpu_buffer->tail_page)) return; - cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->page->commit = cpu_buffer->commit_page->write; rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; } /* Now set the commit to the event's index */ - local_set(&cpu_buffer->commit_page->commit, index); + local_set(&cpu_buffer->commit_page->page->commit, index); } static inline void @@ -826,16 +834,17 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * assign the commit to the tail. */ while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->page->commit = cpu_buffer->commit_page->write; rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; /* add barrier to keep gcc from optimizing too much */ barrier(); } while (rb_commit_index(cpu_buffer) != rb_page_write(cpu_buffer->commit_page)) { - cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->page->commit = cpu_buffer->commit_page->write; barrier(); } @@ -843,7 +852,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { - cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp; + cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; cpu_buffer->reader_page->read = 0; } @@ -862,7 +871,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter) else rb_inc_page(cpu_buffer, &iter->head_page); - iter->read_stamp = iter->head_page->time_stamp; + iter->read_stamp = iter->head_page->page->time_stamp; iter->head = 0; } @@ -998,12 +1007,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, */ if (tail_page == cpu_buffer->tail_page) { local_set(&next_page->write, 0); - local_set(&next_page->commit, 0); + local_set(&next_page->page->commit, 0); cpu_buffer->tail_page = next_page; /* reread the time stamp */ *ts = ring_buffer_time_stamp(cpu_buffer->cpu); - cpu_buffer->tail_page->time_stamp = *ts; + cpu_buffer->tail_page->page->time_stamp = *ts; } /* @@ -1048,7 +1057,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * this page's time stamp. */ if (!tail && rb_is_commit(cpu_buffer, event)) - cpu_buffer->commit_page->time_stamp = *ts; + cpu_buffer->commit_page->page->time_stamp = *ts; return event; @@ -1099,7 +1108,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, event->time_delta = *delta & TS_MASK; event->array[0] = *delta >> TS_SHIFT; } else { - cpu_buffer->commit_page->time_stamp = *ts; + cpu_buffer->commit_page->page->time_stamp = *ts; event->time_delta = 0; event->array[0] = 0; } @@ -1552,7 +1561,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) if (iter->head) iter->read_stamp = cpu_buffer->read_stamp; else - iter->read_stamp = iter->head_page->time_stamp; + iter->read_stamp = iter->head_page->page->time_stamp; } /** @@ -1696,7 +1705,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->list.prev = reader->list.prev; local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->commit, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); /* Make the reader page now replace the head */ reader->list.prev->next = &cpu_buffer->reader_page->list; @@ -2088,7 +2097,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->head_page = list_entry(cpu_buffer->pages.next, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); - local_set(&cpu_buffer->head_page->commit, 0); + local_set(&cpu_buffer->head_page->page->commit, 0); cpu_buffer->head_page->read = 0; @@ -2097,7 +2106,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) INIT_LIST_HEAD(&cpu_buffer->reader_page->list); local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->commit, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; cpu_buffer->overrun = 0; -- cgit v1.2.3 From 8789a9e7df6bf9b93739c4c7d4e380725bc9e936 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 15:34:07 -0500 Subject: ring-buffer: read page interface Impact: new API to ring buffer This patch adds a new interface into the ring buffer that allows a page to be read from the ring buffer on a given CPU. For every page read, one must also be given to allow for a "swap" of the pages. rpage = ring_buffer_alloc_read_page(buffer); if (!rpage) goto err; ret = ring_buffer_read_page(buffer, &rpage, cpu, full); if (!ret) goto empty; process_page(rpage); ring_buffer_free_read_page(rpage); The caller of these functions must handle any waits that are needed to wait for new data. The ring_buffer_read_page will simply return 0 if there is no data, or if "full" is set and the writer is still on the current page. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ring_buffer.h | 5 ++ kernel/trace/ring_buffer.c | 166 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 171 insertions(+) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 3bb87a753fa..1a350a847ed 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -124,6 +124,11 @@ void tracing_on(void); void tracing_off(void); void tracing_off_permanent(void); +void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); +void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); +int ring_buffer_read_page(struct ring_buffer *buffer, + void **data_page, int cpu, int full); + enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, }; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8619c534588..50b74d3a5c3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -687,6 +687,12 @@ static inline int rb_null_event(struct ring_buffer_event *event) return event->type == RINGBUF_TYPE_PADDING; } +static inline void * +__rb_data_page_index(struct buffer_data_page *page, unsigned index) +{ + return page->data + index; +} + static inline void *__rb_page_index(struct buffer_page *page, unsigned index) { return page->page->data + index; @@ -2232,6 +2238,166 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, return 0; } +static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_data_page *page) +{ + struct ring_buffer_event *event; + unsigned long head; + + __raw_spin_lock(&cpu_buffer->lock); + for (head = 0; head < local_read(&page->commit); + head += rb_event_length(event)) { + + event = __rb_data_page_index(page, head); + if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) + return; + /* Only count data entries */ + if (event->type != RINGBUF_TYPE_DATA) + continue; + cpu_buffer->entries--; + } + __raw_spin_unlock(&cpu_buffer->lock); +} + +/** + * ring_buffer_alloc_read_page - allocate a page to read from buffer + * @buffer: the buffer to allocate for. + * + * This function is used in conjunction with ring_buffer_read_page. + * When reading a full page from the ring buffer, these functions + * can be used to speed up the process. The calling function should + * allocate a few pages first with this function. Then when it + * needs to get pages from the ring buffer, it passes the result + * of this function into ring_buffer_read_page, which will swap + * the page that was allocated, with the read page of the buffer. + * + * Returns: + * The page allocated, or NULL on error. + */ +void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) +{ + unsigned long addr; + struct buffer_data_page *page; + + addr = __get_free_page(GFP_KERNEL); + if (!addr) + return NULL; + + page = (void *)addr; + + return page; +} + +/** + * ring_buffer_free_read_page - free an allocated read page + * @buffer: the buffer the page was allocate for + * @data: the page to free + * + * Free a page allocated from ring_buffer_alloc_read_page. + */ +void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) +{ + free_page((unsigned long)data); +} + +/** + * ring_buffer_read_page - extract a page from the ring buffer + * @buffer: buffer to extract from + * @data_page: the page to use allocated from ring_buffer_alloc_read_page + * @cpu: the cpu of the buffer to extract + * @full: should the extraction only happen when the page is full. + * + * This function will pull out a page from the ring buffer and consume it. + * @data_page must be the address of the variable that was returned + * from ring_buffer_alloc_read_page. This is because the page might be used + * to swap with a page in the ring buffer. + * + * for example: + * rpage = ring_buffer_alloc_page(buffer); + * if (!rpage) + * return error; + * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0); + * if (ret) + * process_page(rpage); + * + * When @full is set, the function will not return true unless + * the writer is off the reader page. + * + * Note: it is up to the calling functions to handle sleeps and wakeups. + * The ring buffer can be used anywhere in the kernel and can not + * blindly call wake_up. The layer that uses the ring buffer must be + * responsible for that. + * + * Returns: + * 1 if data has been transferred + * 0 if no data has been transferred. + */ +int ring_buffer_read_page(struct ring_buffer *buffer, + void **data_page, int cpu, int full) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_event *event; + struct buffer_data_page *page; + unsigned long flags; + int ret = 0; + + if (!data_page) + return 0; + + page = *data_page; + if (!page) + return 0; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + /* + * rb_buffer_peek will get the next ring buffer if + * the current reader page is empty. + */ + event = rb_buffer_peek(buffer, cpu, NULL); + if (!event) + goto out; + + /* check for data */ + if (!local_read(&cpu_buffer->reader_page->page->commit)) + goto out; + /* + * If the writer is already off of the read page, then simply + * switch the read page with the given page. Otherwise + * we need to copy the data from the reader to the writer. + */ + if (cpu_buffer->reader_page == cpu_buffer->commit_page) { + unsigned int read = cpu_buffer->reader_page->read; + + if (full) + goto out; + /* The writer is still on the reader page, we must copy */ + page = cpu_buffer->reader_page->page; + memcpy(page->data, + cpu_buffer->reader_page->page->data + read, + local_read(&page->commit) - read); + + /* consume what was read */ + cpu_buffer->reader_page += read; + + } else { + /* swap the pages */ + rb_init_page(page); + page = cpu_buffer->reader_page->page; + cpu_buffer->reader_page->page = *data_page; + cpu_buffer->reader_page->read = 0; + *data_page = page; + } + ret = 1; + + /* update the entry counter */ + rb_remove_entries(cpu_buffer, page); + out: + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return ret; +} + static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) -- cgit v1.2.3 From 347fdd9dd4e5d3f3a4e415925c35bdff1d59c3a9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 15:34:08 -0500 Subject: ftrace: clean up function graph asm Impact: clean up There exists macros for x86 asm to handle x86_64 and i386. This patch updates function graph asm to use them. Signed-off-by: Steven Rostedt Acked-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 58832478b94..1a5b8f8cb3c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -467,28 +467,16 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) * ignore such a protection. */ asm volatile( -#ifdef CONFIG_X86_64 - "1: movq (%[parent_old]), %[old]\n" - "2: movq %[return_hooker], (%[parent_replaced])\n" -#else - "1: movl (%[parent_old]), %[old]\n" - "2: movl %[return_hooker], (%[parent_replaced])\n" -#endif + "1: " _ASM_MOV " (%[parent_old]), %[old]\n" + "2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n" " movl $0, %[faulted]\n" ".section .fixup, \"ax\"\n" "3: movl $1, %[faulted]\n" ".previous\n" - ".section __ex_table, \"a\"\n" -#ifdef CONFIG_X86_64 - " .quad 1b, 3b\n" - " .quad 2b, 3b\n" -#else - " .long 1b, 3b\n" - " .long 2b, 3b\n" -#endif - ".previous\n" + _ASM_EXTABLE(1b, 3b) + _ASM_EXTABLE(2b, 3b) : [parent_replaced] "=r" (parent), [old] "=r" (old), [faulted] "=r" (faulted) -- cgit v1.2.3 From bb4304c71c97bf727ec43cd2f195c2c237c27fd3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 15:34:09 -0500 Subject: ftrace: have function graph use mcount caller address Impact: consistency change for function graph This patch makes function graph record the mcount caller address the same way the function tracer does. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 1 + arch/x86/kernel/entry_64.S | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 958af86186c..826682abed1 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1230,6 +1230,7 @@ ENTRY(ftrace_graph_caller) pushl %edx movl 0xc(%esp), %edx lea 0x4(%ebp), %eax + subl $MCOUNT_INSN_SIZE, %edx call prepare_ftrace_return popl %edx popl %ecx diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 2aa0526ac30..9060ba6497e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -173,6 +173,7 @@ ENTRY(ftrace_graph_caller) leaq 8(%rbp), %rdi movq 0x38(%rsp), %rsi + subq $MCOUNT_INSN_SIZE, %rsi call prepare_ftrace_return -- cgit v1.2.3 From 14a866c567e040ccf6240d68b083dd1dbbde63e6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 23:50:02 -0500 Subject: ftrace: add ftrace_graph_stop() Impact: new ftrace_graph_stop function While developing more features of function graph, I hit a bug that caused the WARN_ON to trigger in the prepare_ftrace_return function. Well, it was hard for me to find out that was happening because the bug would not print, it would just cause a hard lockup or reboot. The reason is that it is not safe to call printk from this function. Looking further, I also found that it calls unregister_ftrace_graph, which grabs a mutex and calls kstop machine. This would definitely lock the box up if it were to trigger. This patch adds a fast and safe ftrace_graph_stop() which will stop the function tracer. Then it is safe to call the WARN ON. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 10 ++++++---- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 5 +++++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1a5b8f8cb3c..adba8e9a427 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -484,14 +484,16 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) : "memory" ); - if (WARN_ON(faulted)) { - unregister_ftrace_graph(); + if (unlikely(faulted)) { + ftrace_graph_stop(); + WARN_ON(1); return; } - if (WARN_ON(!__kernel_text_address(old))) { - unregister_ftrace_graph(); + if (unlikely(!__kernel_text_address(old))) { + ftrace_graph_stop(); *parent = old; + WARN_ON(1); return; } diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index afba918c623..58ca1c3a3f4 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -376,6 +376,8 @@ typedef void (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ extern int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc); +extern void ftrace_graph_stop(void); + /* The current handlers in use */ extern trace_func_graph_ret_t ftrace_graph_return; extern trace_func_graph_ent_t ftrace_graph_entry; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2e78628443e..a44af05ae2d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1769,5 +1769,10 @@ void ftrace_graph_exit_task(struct task_struct *t) kfree(ret_stack); } + +void ftrace_graph_stop(void) +{ + ftrace_stop(); +} #endif -- cgit v1.2.3 From 044fa782ebb9472cf5253e95d9a625fd4c0bdd99 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 23:50:03 -0500 Subject: ring-buffer: change "page" variable names to "bpage" Impact: clean up Andrew Morton pointed out that the kernel convention of a variable named page should be of type page struct. The ring buffer uses a variable named "page" for a pointer to something else. This patch converts those to be called "bpage" (as in "buffer page"). Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 130 ++++++++++++++++++++++----------------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 50b74d3a5c3..7f69cfeaadf 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -208,9 +208,9 @@ struct buffer_page { struct buffer_data_page *page; /* Actual data page */ }; -static void rb_init_page(struct buffer_data_page *page) +static void rb_init_page(struct buffer_data_page *bpage) { - local_set(&page->commit, 0); + local_set(&bpage->commit, 0); } /* @@ -298,19 +298,19 @@ struct ring_buffer_iter { static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *head = &cpu_buffer->pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) return -1; if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) return -1; - list_for_each_entry_safe(page, tmp, head, list) { + list_for_each_entry_safe(bpage, tmp, head, list) { if (RB_WARN_ON(cpu_buffer, - page->list.next->prev != &page->list)) + bpage->list.next->prev != &bpage->list)) return -1; if (RB_WARN_ON(cpu_buffer, - page->list.prev->next != &page->list)) + bpage->list.prev->next != &bpage->list)) return -1; } @@ -321,23 +321,23 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { struct list_head *head = &cpu_buffer->pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; unsigned long addr; LIST_HEAD(pages); unsigned i; for (i = 0; i < nr_pages; i++) { - page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); - if (!page) + if (!bpage) goto free_pages; - list_add(&page->list, &pages); + list_add(&bpage->list, &pages); addr = __get_free_page(GFP_KERNEL); if (!addr) goto free_pages; - page->page = (void *)addr; - rb_init_page(page->page); + bpage->page = (void *)addr; + rb_init_page(bpage->page); } list_splice(&pages, head); @@ -347,9 +347,9 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, return 0; free_pages: - list_for_each_entry_safe(page, tmp, &pages, list) { - list_del_init(&page->list); - free_buffer_page(page); + list_for_each_entry_safe(bpage, tmp, &pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); } return -ENOMEM; } @@ -358,7 +358,7 @@ static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - struct buffer_page *page; + struct buffer_page *bpage; unsigned long addr; int ret; @@ -373,17 +373,17 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; INIT_LIST_HEAD(&cpu_buffer->pages); - page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); - if (!page) + if (!bpage) goto fail_free_buffer; - cpu_buffer->reader_page = page; + cpu_buffer->reader_page = bpage; addr = __get_free_page(GFP_KERNEL); if (!addr) goto fail_free_reader; - page->page = (void *)addr; - rb_init_page(page->page); + bpage->page = (void *)addr; + rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); @@ -408,14 +408,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *head = &cpu_buffer->pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; list_del_init(&cpu_buffer->reader_page->list); free_buffer_page(cpu_buffer->reader_page); - list_for_each_entry_safe(page, tmp, head, list) { - list_del_init(&page->list); - free_buffer_page(page); + list_for_each_entry_safe(bpage, tmp, head, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); } kfree(cpu_buffer); } @@ -512,7 +512,7 @@ static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); static void rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { - struct buffer_page *page; + struct buffer_page *bpage; struct list_head *p; unsigned i; @@ -523,9 +523,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) return; p = cpu_buffer->pages.next; - page = list_entry(p, struct buffer_page, list); - list_del_init(&page->list); - free_buffer_page(page); + bpage = list_entry(p, struct buffer_page, list); + list_del_init(&bpage->list); + free_buffer_page(bpage); } if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) return; @@ -542,7 +542,7 @@ static void rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, struct list_head *pages, unsigned nr_pages) { - struct buffer_page *page; + struct buffer_page *bpage; struct list_head *p; unsigned i; @@ -553,9 +553,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, list_empty(pages))) return; p = pages->next; - page = list_entry(p, struct buffer_page, list); - list_del_init(&page->list); - list_add_tail(&page->list, &cpu_buffer->pages); + bpage = list_entry(p, struct buffer_page, list); + list_del_init(&bpage->list); + list_add_tail(&bpage->list, &cpu_buffer->pages); } rb_reset_cpu(cpu_buffer); @@ -582,7 +582,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) { struct ring_buffer_per_cpu *cpu_buffer; unsigned nr_pages, rm_pages, new_pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; unsigned long buffer_size; unsigned long addr; LIST_HEAD(pages); @@ -643,17 +643,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) for_each_buffer_cpu(buffer, cpu) { for (i = 0; i < new_pages; i++) { - page = kzalloc_node(ALIGN(sizeof(*page), + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); - if (!page) + if (!bpage) goto free_pages; - list_add(&page->list, &pages); + list_add(&bpage->list, &pages); addr = __get_free_page(GFP_KERNEL); if (!addr) goto free_pages; - page->page = (void *)addr; - rb_init_page(page->page); + bpage->page = (void *)addr; + rb_init_page(bpage->page); } } @@ -674,9 +674,9 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) return size; free_pages: - list_for_each_entry_safe(page, tmp, &pages, list) { - list_del_init(&page->list); - free_buffer_page(page); + list_for_each_entry_safe(bpage, tmp, &pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); } mutex_unlock(&buffer->mutex); return -ENOMEM; @@ -688,14 +688,14 @@ static inline int rb_null_event(struct ring_buffer_event *event) } static inline void * -__rb_data_page_index(struct buffer_data_page *page, unsigned index) +__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) { - return page->data + index; + return bpage->data + index; } -static inline void *__rb_page_index(struct buffer_page *page, unsigned index) +static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) { - return page->page->data + index; + return bpage->page->data + index; } static inline struct ring_buffer_event * @@ -771,14 +771,14 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) } static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page **page) + struct buffer_page **bpage) { - struct list_head *p = (*page)->list.next; + struct list_head *p = (*bpage)->list.next; if (p == &cpu_buffer->pages) p = p->next; - *page = list_entry(p, struct buffer_page, list); + *bpage = list_entry(p, struct buffer_page, list); } static inline unsigned @@ -2239,16 +2239,16 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, } static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_data_page *page) + struct buffer_data_page *bpage) { struct ring_buffer_event *event; unsigned long head; __raw_spin_lock(&cpu_buffer->lock); - for (head = 0; head < local_read(&page->commit); + for (head = 0; head < local_read(&bpage->commit); head += rb_event_length(event)) { - event = __rb_data_page_index(page, head); + event = __rb_data_page_index(bpage, head); if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) return; /* Only count data entries */ @@ -2277,15 +2277,15 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) { unsigned long addr; - struct buffer_data_page *page; + struct buffer_data_page *bpage; addr = __get_free_page(GFP_KERNEL); if (!addr) return NULL; - page = (void *)addr; + bpage = (void *)addr; - return page; + return bpage; } /** @@ -2337,15 +2337,15 @@ int ring_buffer_read_page(struct ring_buffer *buffer, { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; - struct buffer_data_page *page; + struct buffer_data_page *bpage; unsigned long flags; int ret = 0; if (!data_page) return 0; - page = *data_page; - if (!page) + bpage = *data_page; + if (!bpage) return 0; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -2372,26 +2372,26 @@ int ring_buffer_read_page(struct ring_buffer *buffer, if (full) goto out; /* The writer is still on the reader page, we must copy */ - page = cpu_buffer->reader_page->page; - memcpy(page->data, + bpage = cpu_buffer->reader_page->page; + memcpy(bpage->data, cpu_buffer->reader_page->page->data + read, - local_read(&page->commit) - read); + local_read(&bpage->commit) - read); /* consume what was read */ cpu_buffer->reader_page += read; } else { /* swap the pages */ - rb_init_page(page); - page = cpu_buffer->reader_page->page; + rb_init_page(bpage); + bpage = cpu_buffer->reader_page->page; cpu_buffer->reader_page->page = *data_page; cpu_buffer->reader_page->read = 0; - *data_page = page; + *data_page = bpage; } ret = 1; /* update the entry counter */ - rb_remove_entries(cpu_buffer, page); + rb_remove_entries(cpu_buffer, bpage); out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -- cgit v1.2.3 From 7ee991fbc6f947e9b04f29c9c6c1d057d0671a16 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 23:50:04 -0500 Subject: ftrace: print real return in dumpstack for function graph Impact: better dumpstack output I noticed in my crash dumps and even in the stack tracer that a lot of functions listed in the stack trace are simply return_to_handler which is ftrace graphs way to insert its own call into the return of a function. But we lose out where the actually function was called from. This patch adds in hooks to the dumpstack mechanism that detects this and finds the real function to print. Both are printed to let the user know that a hook is still in place. This does give a funny side effect in the stack tracer output: Depth Size Location (80 entries) ----- ---- -------- 0) 4144 48 save_stack_trace+0x2f/0x4d 1) 4096 128 ftrace_call+0x5/0x2b 2) 3968 16 mempool_alloc_slab+0x16/0x18 3) 3952 384 return_to_handler+0x0/0x73 4) 3568 -240 stack_trace_call+0x11d/0x209 5) 3808 144 return_to_handler+0x0/0x73 6) 3664 -128 mempool_alloc+0x4d/0xfe 7) 3792 128 return_to_handler+0x0/0x73 8) 3664 -32 scsi_sg_alloc+0x48/0x4a [scsi_mod] As you can see, the real functions are now negative. This is due to them not being found inside the stack. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 34 +++++++++++++++++++++++++++++++++- arch/x86/kernel/dumpstack.h | 2 +- arch/x86/kernel/dumpstack_32.c | 5 ++++- arch/x86/kernel/dumpstack_64.c | 7 ++++--- 4 files changed, 42 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 5962176dfab..6b1f6f6f866 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -30,6 +30,37 @@ void printk_address(unsigned long address, int reliable) reliable ? "" : "? ", (void *) address); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void +print_ftrace_graph_addr(unsigned long addr, void *data, + const struct stacktrace_ops *ops, + struct thread_info *tinfo, int *graph) +{ + struct task_struct *task = tinfo->task; + unsigned long ret_addr; + int index = task->curr_ret_stack; + + if (addr != (unsigned long)return_to_handler) + return; + + if (!task->ret_stack || index < *graph) + return; + + index -= *graph; + ret_addr = task->ret_stack[index].ret; + + ops->address(data, ret_addr, 1); + + (*graph)++; +} +#else +static inline void +print_ftrace_graph_addr(unsigned long addr, void *data, + const struct stacktrace_ops *ops, + struct thread_info *tinfo, int *graph) +{ } +#endif + /* * x86-64 can have up to three kernel stacks: * process stack @@ -54,7 +85,7 @@ unsigned long print_context_stack(struct thread_info *tinfo, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, - unsigned long *end) + unsigned long *end, int *graph) { struct stack_frame *frame = (struct stack_frame *)bp; @@ -70,6 +101,7 @@ print_context_stack(struct thread_info *tinfo, } else { ops->address(data, addr, bp == 0); } + print_ftrace_graph_addr(addr, data, ops, tinfo, graph); } stack++; } diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index 3119a801c32..da87590b869 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -18,7 +18,7 @@ extern unsigned long print_context_stack(struct thread_info *tinfo, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, - unsigned long *end); + unsigned long *end, int *graph); extern void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 7b031b106ec..d593cd1f58d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -23,6 +23,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { + int graph = 0; + if (!task) task = current; @@ -50,7 +52,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, data, NULL); + bp = print_context_stack(context, stack, bp, ops, + data, NULL, &graph); stack = (unsigned long *)context->previous_esp; if (!stack) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 33ff10287a5..c302d070704 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -109,6 +109,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; unsigned used = 0; struct thread_info *tinfo; + int graph = 0; if (!task) task = current; @@ -149,7 +150,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, break; bp = print_context_stack(tinfo, stack, bp, ops, - data, estack_end); + data, estack_end, &graph); ops->stack(data, ""); /* * We link to the next stack via the @@ -168,7 +169,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (ops->stack(data, "IRQ") < 0) break; bp = print_context_stack(tinfo, stack, bp, - ops, data, irqstack_end); + ops, data, irqstack_end, &graph); /* * We link to the next stack (which would be * the process stack normally) the last @@ -186,7 +187,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, /* * This handles the process stack: */ - bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); + bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph); put_cpu(); } EXPORT_SYMBOL(dump_trace); -- cgit v1.2.3 From e49dc19c6a19ea112fcb94b7c62ec62cdd5c08aa Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 23:50:05 -0500 Subject: ftrace: function graph return for function entry Impact: feature, let entry function decide to trace or not This patch lets the graph tracer entry function decide if the tracing should be done at the end as well. This requires all function graph entry functions return 1 if it should trace, or 0 if the return should not be traced. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 3 +++ arch/x86/kernel/entry_64.S | 3 +++ arch/x86/kernel/ftrace.c | 7 ++++++- include/linux/ftrace.h | 2 +- kernel/trace/ftrace.c | 10 +++++++--- kernel/trace/trace.c | 4 +++- kernel/trace/trace.h | 2 +- 7 files changed, 24 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 826682abed1..43ceb3f454b 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1196,6 +1196,9 @@ ENTRY(mcount) #ifdef CONFIG_FUNCTION_GRAPH_TRACER cmpl $ftrace_stub, ftrace_graph_return jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller #endif .globl ftrace_stub ftrace_stub: diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 9060ba6497e..54e0bbdccb9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -120,6 +120,9 @@ ENTRY(mcount) #ifdef CONFIG_FUNCTION_GRAPH_TRACER cmpq $ftrace_stub, ftrace_graph_return jnz ftrace_graph_caller + + cmpq $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller #endif .globl ftrace_stub diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index adba8e9a427..d278ad2ebda 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -425,6 +425,7 @@ static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) trace->calltime = current->ret_stack[index].calltime; trace->overrun = atomic_read(¤t->trace_overrun); trace->depth = index; + barrier(); current->curr_ret_stack--; } @@ -506,7 +507,11 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) } trace.func = self_addr; - ftrace_graph_entry(&trace); + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) { + current->curr_ret_stack--; + *parent = old; + } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 58ca1c3a3f4..469ceb3e85b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -371,7 +371,7 @@ struct ftrace_graph_ret { #define FTRACE_RETSTACK_ALLOC_SIZE 32 /* Type of the callback handlers for tracing function graph*/ typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */ -typedef void (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ +typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ extern int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a44af05ae2d..65b9e863056 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1636,11 +1636,15 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, static atomic_t ftrace_graph_active; +int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) +{ + return 0; +} + /* The callbacks that hook a function */ trace_func_graph_ret_t ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; -trace_func_graph_ent_t ftrace_graph_entry = - (trace_func_graph_ent_t)ftrace_stub; +trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) @@ -1738,7 +1742,7 @@ void unregister_ftrace_graph(void) atomic_dec(&ftrace_graph_active); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; - ftrace_graph_entry = (trace_func_graph_ent_t)ftrace_stub; + ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); mutex_unlock(&ftrace_sysctl_lock); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 380de630ebc..8b6409a62b5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1200,7 +1200,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -void trace_graph_entry(struct ftrace_graph_ent *trace) +int trace_graph_entry(struct ftrace_graph_ent *trace) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -1219,6 +1219,8 @@ void trace_graph_entry(struct ftrace_graph_ent *trace) } atomic_dec(&data->disabled); local_irq_restore(flags); + + return 1; } void trace_graph_return(struct ftrace_graph_ret *trace) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f96f4e787ff..0565ae9a221 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -412,7 +412,7 @@ void trace_function(struct trace_array *tr, unsigned long flags, int pc); void trace_graph_return(struct ftrace_graph_ret *trace); -void trace_graph_entry(struct ftrace_graph_ent *trace); +int trace_graph_entry(struct ftrace_graph_ent *trace); void trace_bts(struct trace_array *tr, unsigned long from, unsigned long to); -- cgit v1.2.3 From 62679efe0a5f02987a621942afc5979a80a6ca5a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 23:50:06 -0500 Subject: ftrace: add checks on ret stack in function graph Import: robustness checks Add more checks in the function graph code to detect errors and perhaps print out better information if a bug happens. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d278ad2ebda..f98c4076a17 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -420,6 +420,15 @@ static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) int index; index = current->curr_ret_stack; + + if (unlikely(index < 0)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic, otherwise we have no where to go */ + *ret = (unsigned long)panic; + return; + } + *ret = current->ret_stack[index].ret; trace->func = current->ret_stack[index].func; trace->calltime = current->ret_stack[index].calltime; @@ -427,6 +436,7 @@ static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) trace->depth = index; barrier(); current->curr_ret_stack--; + } /* @@ -442,6 +452,13 @@ unsigned long ftrace_return_to_handler(void) trace.rettime = cpu_clock(raw_smp_processor_id()); ftrace_graph_return(&trace); + if (unlikely(!ret)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic. What else to do? */ + ret = (unsigned long)panic; + } + return ret; } -- cgit v1.2.3 From 11e84acc400921743cc8d488e4a265cd98a655c7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 3 Dec 2008 02:30:37 +0100 Subject: tracing/function-graph-tracer: display unified style cmdline and pid Impact: extend function-graph output: let one know which thread called a function This patch implements a helper function to print the couple cmdline/pid. Its output is provided during task switching and on each row if the new "funcgraph-proc" defualt-off option is set through trace_options file. The output is center aligned and never exceeds 14 characters. The cmdline is truncated over 7 chars. But note that if the pid exceeds 6 characters, the column will overflow (but the situation is abnormal). Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 113 ++++++++++++++++++++++++++++++----- 1 file changed, 99 insertions(+), 14 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 894b50bca31..23e19d2dbd0 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -19,6 +19,7 @@ #define TRACE_GRAPH_PRINT_OVERRUN 0x1 #define TRACE_GRAPH_PRINT_CPU 0x2 #define TRACE_GRAPH_PRINT_OVERHEAD 0x4 +#define TRACE_GRAPH_PRINT_PROC 0x8 static struct tracer_opt trace_opts[] = { /* Display overruns ? */ @@ -27,11 +28,13 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) }, /* Display Overhead ? */ { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) }, + /* Display proc name/pid */ + { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { - /* Don't display overruns by default */ + /* Don't display overruns and proc by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD, .opts = trace_opts }; @@ -104,23 +107,63 @@ print_graph_cpu(struct trace_seq *s, int cpu) return TRACE_TYPE_HANDLED; } +#define TRACE_GRAPH_PROCINFO_LENGTH 14 + +static enum print_line_t +print_graph_proc(struct trace_seq *s, pid_t pid) +{ + int i; + int ret; + int len; + char comm[8]; + int spaces = 0; + /* sign + log10(MAX_INT) + '\0' */ + char pid_str[11]; + + strncpy(comm, trace_find_cmdline(pid), 7); + comm[7] = '\0'; + sprintf(pid_str, "%d", pid); + + /* 1 stands for the "-" character */ + len = strlen(comm) + strlen(pid_str) + 1; + + if (len < TRACE_GRAPH_PROCINFO_LENGTH) + spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; + + /* First spaces to align center */ + for (i = 0; i < spaces / 2; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "%s-%s", comm, pid_str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Last spaces to align center */ + for (i = 0; i < spaces - (spaces / 2); i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + return TRACE_TYPE_HANDLED; +} + /* If the pid changed since the last trace, output this event */ -static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) +static enum print_line_t +verif_pid(struct trace_seq *s, pid_t pid, int cpu) { - char *comm, *prev_comm; pid_t prev_pid; int ret; if (last_pid[cpu] != -1 && last_pid[cpu] == pid) - return 1; + return TRACE_TYPE_HANDLED; prev_pid = last_pid[cpu]; last_pid[cpu] = pid; - comm = trace_find_cmdline(pid); - prev_comm = trace_find_cmdline(prev_pid); - /* * Context-switch trace line: @@ -130,11 +173,31 @@ static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) */ ret = trace_seq_printf(s, - " ------------------------------------------\n"); - ret += trace_seq_printf(s, " | %d) %s-%d => %s-%d\n", - cpu, prev_comm, prev_pid, comm, pid); - ret += trace_seq_printf(s, - " ------------------------------------------\n\n"); + "\n ------------------------------------------\n |"); + if (!ret) + TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_proc(s, prev_pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " => "); + if (!ret) + TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_proc(s, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, + "\n ------------------------------------------\n\n"); + if (!ret) + TRACE_TYPE_PARTIAL_LINE; + return ret; } @@ -288,12 +351,23 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, struct trace_entry *ent = iter->ent; /* Pid */ - if (!verif_pid(s, ent->pid, cpu)) + if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; /* Cpu */ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -318,12 +392,23 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, duration = 9999999ULL; /* Pid */ - if (!verif_pid(s, ent->pid, cpu)) + if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; /* Cpu */ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } -- cgit v1.2.3 From 166d3c7994d79ab3f78f420607283361ff5cce79 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 3 Dec 2008 02:32:12 +0100 Subject: tracing/function-graph-tracer: improve duration output Impact: better trace output of duration for long calls The old duration output didn't exceeded 9999.999 us to fit the column and the nanosecs were always 3 numbers. As Ingo suggested, it's better to have the whole microseconds elapsed time and shift the nanosecs precision if needed to fit the maximum 7 numbers. And usec need more number, the case should be rare and important enough to break a bit the column alignment to show it. So, depending of the duration value, we now have these patterns: u.nnn us uu.nnn us uuu.nnn us uuuu.nnn us uuuuu.nn us uuuuuu.n us uuuuuuuu..... us Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 55 ++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 23e19d2dbd0..c66578f2fdc 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -232,11 +232,50 @@ trace_branch_is_leaf(struct trace_iterator *iter, } -static inline int +static enum print_line_t print_graph_duration(unsigned long long duration, struct trace_seq *s) { unsigned long nsecs_rem = do_div(duration, 1000); - return trace_seq_printf(s, "%4llu.%3lu us | ", duration, nsecs_rem); + /* log10(ULONG_MAX) + '\0' */ + char msecs_str[21]; + char nsecs_str[5]; + int ret, len; + int i; + + sprintf(msecs_str, "%lu", (unsigned long) duration); + + /* Print msecs */ + ret = trace_seq_printf(s, msecs_str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + len = strlen(msecs_str); + + /* Print nsecs (we don't want to exceed 7 numbers) */ + if (len < 7) { + snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); + ret = trace_seq_printf(s, ".%s", nsecs_str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + len += strlen(nsecs_str); + } + + ret = trace_seq_printf(s, " us "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Print remaining spaces to fit the row's width */ + for (i = len; i < 7; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "| "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; + } /* Signal a overhead of time execution to the output */ @@ -273,10 +312,6 @@ print_graph_entry_leaf(struct trace_iterator *iter, call = &entry->graph_ent; duration = graph_ret->rettime - graph_ret->calltime; - /* Must not exceed 8 characters: 9999.999 us */ - if (duration > 10000000ULL) - duration = 9999999ULL; - /* Overhead */ if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { ret = print_graph_overhead(duration, s); @@ -286,7 +321,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, /* Duration */ ret = print_graph_duration(duration, s); - if (!ret) + if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; /* Function */ @@ -387,10 +422,6 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, int ret; unsigned long long duration = trace->rettime - trace->calltime; - /* Must not exceed 8 characters: xxxx.yyy us */ - if (duration > 10000000ULL) - duration = 9999999ULL; - /* Pid */ if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -422,7 +453,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, /* Duration */ ret = print_graph_duration(duration, s); - if (!ret) + if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; /* Closing brace */ -- cgit v1.2.3 From 764f3b95131a7ce5c992e3d00caf590fcada2f7b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Dec 2008 10:33:58 +0100 Subject: tracing/function-graph-tracer: enabled by default CONFIG_FUNCTION_GRAPH_TRACER depends on FUNCTION_TRACER already, (turning it non-default) so it so making it default-n is pointless. So enable it by default - it's a nice extension of the function tracer. Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8b6b673b4d6..bde6f03512d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -67,6 +67,7 @@ config FUNCTION_GRAPH_TRACER bool "Kernel Function Graph Tracer" depends on HAVE_FUNCTION_GRAPH_TRACER depends on FUNCTION_TRACER + default y help Enable the kernel to trace a function at both its return and its entry. -- cgit v1.2.3 From 0a37119d963e876ca86912497346ec50dea2541b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Dec 2008 11:04:50 -0500 Subject: trace: fix output of stack trace Impact: fix to output of stack trace If a function is not found in the stack of the stack tracer, the number printed is quite strange. This fixes the algorithm to handle missing functions better. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_stack.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 06a16115be0..0b863f2cbc8 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -78,6 +78,7 @@ static inline void check_stack(void) * on a new max, so it is far from a fast path. */ while (i < max_stack_trace.nr_entries) { + int found = 0; stack_dump_index[i] = this_size; p = start; @@ -86,12 +87,14 @@ static inline void check_stack(void) if (*p == stack_dump_trace[i]) { this_size = stack_dump_index[i++] = (top - p) * sizeof(unsigned long); + found = 1; /* Start the search from here */ start = p + 1; } } - i++; + if (!found) + i++; } out: -- cgit v1.2.3 From e8e1abe92fd7ea9d823a3aaf81d10e2cba593b6b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Dec 2008 11:04:51 -0500 Subject: ftrace: fix race in function graph during fork Impact: graph tracer race/crash fix There is a nasy race in startup of a new process running the function graph tracer. In fork.c: total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); ftrace_graph_init_task(p); proc_fork_connector(p); cgroup_post_fork(p); return p; The new task is free to run as soon as the tasklist_lock is released. This is before the ftrace_graph_init_task. If the task does run it will be using the same ret_stack and curr_ret_stack as the parent. This will cause crashes that are difficult to debug. This patch moves the ftrace_graph_init_task to just after the alloc_pid code. This fixes the above race. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/fork.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 5f82a999c03..7407ab31987 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1137,6 +1137,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, } } + ftrace_graph_init_task(p); + p->pid = pid_nr(pid); p->tgid = p->pid; if (clone_flags & CLONE_THREAD) @@ -1145,7 +1147,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (current->nsproxy != p->nsproxy) { retval = ns_cgroup_clone(p, pid); if (retval) - goto bad_fork_free_pid; + goto bad_fork_free_graph; } p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; @@ -1238,7 +1240,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; - goto bad_fork_free_pid; + goto bad_fork_free_graph; } if (clone_flags & CLONE_THREAD) { @@ -1271,11 +1273,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); - ftrace_graph_init_task(p); proc_fork_connector(p); cgroup_post_fork(p); return p; +bad_fork_free_graph: + ftrace_graph_exit_task(p); bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); -- cgit v1.2.3