From c1dc0b9c0c8979ce4d411caadff5c0d79dee58bc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 2 Aug 2009 11:28:21 +0200 Subject: debug lockups: Improve lockup detection When debugging a recent lockup bug i found various deficiencies in how our current lockup detection helpers work: - SysRq-L is not very efficient as it uses a workqueue, hence it cannot punch through hard lockups and cannot see through most soft lockups either. - The SysRq-L code depends on the NMI watchdog - which is off by default. - We dont print backtraces from the RCU code's built-in 'RCU state machine is stuck' debug code. This debug code tends to be one of the first (and only) mechanisms that show that a lockup has occured. This patch changes the code so taht we: - Trigger the NMI backtrace code from SysRq-L instead of using a workqueue (which cannot punch through hard lockups) - Trigger print-all-CPU-backtraces from the RCU lockup detection code Also decouple the backtrace printing code from the NMI watchdog: - Dont use variable size cpumasks (it might not be initialized and they are a bit more fragile anyway) - Trigger an NMI immediately via an IPI, instead of waiting for the NMI tick to occur. This is a lot faster and can produce more relevant backtraces. It will also work if the NMI watchdog is disabled. - Dont print the 'dazed and confused' message when we print a backtrace from the NMI - Do a show_regs() plus a dump_stack() to get maximum info out of the dump. Worst-case we get two stacktraces - which is not a big deal. Sometimes, if register content is corrupted, the precise stack walker in show_regs() wont give us a full backtrace - in this case dump_stack() will do it. Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Andrew Morton Cc: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- drivers/char/sysrq.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'drivers/char') diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 5d7a02f63e1..165f307f30e 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -222,12 +223,7 @@ static DECLARE_WORK(sysrq_showallcpus, sysrq_showregs_othercpus); static void sysrq_handle_showallcpus(int key, struct tty_struct *tty) { - struct pt_regs *regs = get_irq_regs(); - if (regs) { - printk(KERN_INFO "CPU%d:\n", smp_processor_id()); - show_regs(regs); - } - schedule_work(&sysrq_showallcpus); + trigger_all_cpu_backtrace(); } static struct sysrq_key_op sysrq_showallcpus_op = { -- cgit v1.2.3 From 47cab6a722d44c71c4f8224017ef548522243cf4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Aug 2009 09:31:54 +0200 Subject: debug lockups: Improve lockup detection, fix generic arch fallback As Andrew noted, my previous patch ("debug lockups: Improve lockup detection") broke/removed SysRq-L support from architecture that do not provide a __trigger_all_cpu_backtrace implementation. Restore a fallback path and clean up the SysRq-L machinery a bit: - Rename the arch method to arch_trigger_all_cpu_backtrace() - Simplify the define - Document the method a bit - in the hope of more architectures adding support for it. [ The patch touches Sparc code for the rename. ] Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Andrew Morton Cc: Linus Torvalds Cc: "David S. Miller" LKML-Reference: <20090802140809.7ec4bb6b.akpm@linux-foundation.org> Signed-off-by: Ingo Molnar --- drivers/char/sysrq.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'drivers/char') diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 165f307f30e..50eecfe1d72 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -223,7 +223,20 @@ static DECLARE_WORK(sysrq_showallcpus, sysrq_showregs_othercpus); static void sysrq_handle_showallcpus(int key, struct tty_struct *tty) { - trigger_all_cpu_backtrace(); + /* + * Fall back to the workqueue based printing if the + * backtrace printing did not succeed or the + * architecture has no support for it: + */ + if (!trigger_all_cpu_backtrace()) { + struct pt_regs *regs = get_irq_regs(); + + if (regs) { + printk(KERN_INFO "CPU%d:\n", smp_processor_id()); + show_regs(regs); + } + schedule_work(&sysrq_showallcpus); + } } static struct sysrq_key_op sysrq_showallcpus_op = { -- cgit v1.2.3