diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2008-12-28 12:21:10 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-12-28 12:21:10 -0800 |
commit | b0f4b285d7ed174804658539129a834270f4829a (patch) | |
tree | be7f8dca58075aba2c6a137fcfd4d44c5c333efc /kernel | |
parent | be9c5ae4eeec2e85527e95647348b8ea4eb25128 (diff) | |
parent | 5250d329e38cdf7580faeb9c53c17d3588d7d19c (diff) |
Merge branch 'tracing-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'tracing-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (241 commits)
sched, trace: update trace_sched_wakeup()
tracing/ftrace: don't trace on early stage of a secondary cpu boot, v3
Revert "x86: disable X86_PTRACE_BTS"
ring-buffer: prevent false positive warning
ring-buffer: fix dangling commit race
ftrace: enable format arguments checking
x86, bts: memory accounting
x86, bts: add fork and exit handling
ftrace: introduce tracing_reset_online_cpus() helper
tracing: fix warnings in kernel/trace/trace_sched_switch.c
tracing: fix warning in kernel/trace/trace.c
tracing/ring-buffer: remove unused ring_buffer size
trace: fix task state printout
ftrace: add not to regex on filtering functions
trace: better use of stack_trace_enabled for boot up code
trace: add a way to enable or disable the stack tracer
x86: entry_64 - introduce FTRACE_ frame macro v2
tracing/ftrace: add the printk-msg-only option
tracing/ftrace: use preempt_enable_no_resched_notrace in ring_buffer_time_stamp()
x86, bts: correctly report invalid bts records
...
Fixed up trivial conflict in scripts/recordmcount.pl due to SH bits
being already partly merged by the SH merge.
Diffstat (limited to 'kernel')
35 files changed, 4795 insertions, 1003 deletions
diff --git a/kernel/exit.c b/kernel/exit.c index ccb87162ff6..c7422ca9203 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -55,6 +55,10 @@ #include <asm/mmu_context.h> #include "cred-internals.h" +DEFINE_TRACE(sched_process_free); +DEFINE_TRACE(sched_process_exit); +DEFINE_TRACE(sched_process_wait); + static void exit_mm(struct task_struct * tsk); static inline int task_detached(struct task_struct *p) @@ -1127,7 +1131,6 @@ NORET_TYPE void do_exit(long code) preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; - schedule(); BUG(); /* Avoid "noreturn function does return". */ diff --git a/kernel/extable.c b/kernel/extable.c index a26cb2e1702..feb0317cf09 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -17,6 +17,7 @@ */ #include <linux/module.h> #include <linux/init.h> +#include <linux/ftrace.h> #include <asm/uaccess.h> #include <asm/sections.h> @@ -40,7 +41,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) return e; } -int core_kernel_text(unsigned long addr) +__notrace_funcgraph int core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) @@ -53,7 +54,7 @@ int core_kernel_text(unsigned long addr) return 0; } -int __kernel_text_address(unsigned long addr) +__notrace_funcgraph int __kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) return 1; diff --git a/kernel/fork.c b/kernel/fork.c index 4e8ca23c0ed..6144b36cd89 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -47,6 +47,7 @@ #include <linux/mount.h> #include <linux/audit.h> #include <linux/memcontrol.h> +#include <linux/ftrace.h> #include <linux/profile.h> #include <linux/rmap.h> #include <linux/acct.h> @@ -80,6 +81,8 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +DEFINE_TRACE(sched_process_fork); + int nr_processes(void) { int cpu; @@ -137,6 +140,7 @@ void free_task(struct task_struct 
*tsk) prop_local_destroy_single(&tsk->dirties); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); + ftrace_graph_exit_task(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -1080,6 +1084,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif + if (unlikely(ptrace_reparented(current))) + ptrace_fork(p, clone_flags); /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); @@ -1120,6 +1126,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, } } + ftrace_graph_init_task(p); + p->pid = pid_nr(pid); p->tgid = p->pid; if (clone_flags & CLONE_THREAD) @@ -1128,7 +1136,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (current->nsproxy != p->nsproxy) { retval = ns_cgroup_clone(p, pid); if (retval) - goto bad_fork_free_pid; + goto bad_fork_free_graph; } p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; @@ -1221,7 +1229,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; - goto bad_fork_free_pid; + goto bad_fork_free_graph; } if (clone_flags & CLONE_THREAD) { @@ -1258,6 +1266,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, cgroup_post_fork(p); return p; +bad_fork_free_graph: + ftrace_graph_exit_task(p); bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); diff --git a/kernel/kthread.c b/kernel/kthread.c index 8e7a7ce3ed0..4fbc456f393 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -21,6 +21,9 @@ static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; +DEFINE_TRACE(sched_kthread_stop); +DEFINE_TRACE(sched_kthread_stop_ret); + struct kthread_create_info { /* Information passed to kthread() from kthreadd.
*/ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 46a404173db..74b1878b8bb 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -25,6 +25,7 @@ * Thanks to Arjan van de Ven for coming up with the initial idea of * mapping lock dependencies runtime. */ +#define DISABLE_BRANCH_PROFILING #include <linux/mutex.h> #include <linux/sched.h> #include <linux/delay.h> diff --git a/kernel/marker.c b/kernel/marker.c index e9c6b2bc940..ea54f264786 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -43,6 +43,7 @@ static DEFINE_MUTEX(markers_mutex); */ #define MARKER_HASH_BITS 6 #define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) +static struct hlist_head marker_table[MARKER_TABLE_SIZE]; /* * Note about RCU : @@ -64,11 +65,10 @@ struct marker_entry { void *oldptr; int rcu_pending; unsigned char ptype:1; + unsigned char format_allocated:1; char name[0]; /* Contains name'\0'format'\0' */ }; -static struct hlist_head marker_table[MARKER_TABLE_SIZE]; - /** * __mark_empty_function - Empty probe callback * @probe_private: probe private data @@ -81,7 +81,7 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE]; * though the function pointer change and the marker enabling are two distinct * operations that modifies the execution flow of preemptible code. */ -void __mark_empty_function(void *probe_private, void *call_private, +notrace void __mark_empty_function(void *probe_private, void *call_private, const char *fmt, va_list *args) { } @@ -97,7 +97,8 @@ EXPORT_SYMBOL_GPL(__mark_empty_function); * need to put a full smp_rmb() in this branch. This is why we do not use * rcu_dereference() for the pointer read. */ -void marker_probe_cb(const struct marker *mdata, void *call_private, ...) +notrace void marker_probe_cb(const struct marker *mdata, + void *call_private, ...) { va_list args; char ptype; @@ -107,7 +108,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) 
* sure the teardown of the callbacks can be done correctly when they * are in modules and they insure RCU read coherency. */ - rcu_read_lock_sched(); + rcu_read_lock_sched_notrace(); ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; @@ -145,7 +146,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) va_end(args); } } - rcu_read_unlock_sched(); + rcu_read_unlock_sched_notrace(); } EXPORT_SYMBOL_GPL(marker_probe_cb); @@ -157,12 +158,13 @@ EXPORT_SYMBOL_GPL(marker_probe_cb); * * Should be connected to markers "MARK_NOARGS". */ -void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) +static notrace void marker_probe_cb_noarg(const struct marker *mdata, + void *call_private, ...) { va_list args; /* not initialized */ char ptype; - rcu_read_lock_sched(); + rcu_read_lock_sched_notrace(); ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; @@ -195,9 +197,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) multi[i].func(multi[i].probe_private, call_private, mdata->format, &args); } - rcu_read_unlock_sched(); + rcu_read_unlock_sched_notrace(); } -EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); static void free_old_closure(struct rcu_head *head) { @@ -416,6 +417,7 @@ static struct marker_entry *add_marker(const char *name, const char *format) e->single.probe_private = NULL; e->multi = NULL; e->ptype = 0; + e->format_allocated = 0; e->refcount = 0; e->rcu_pending = 0; hlist_add_head(&e->hlist, head); @@ -447,6 +449,8 @@ static int remove_marker(const char *name) if (e->single.func != __mark_empty_function) return -EBUSY; hlist_del(&e->hlist); + if (e->format_allocated) + kfree(e->format); /* Make sure the call_rcu has been executed */ if (e->rcu_pending) rcu_barrier_sched(); @@ -457,57 +461,34 @@ static int remove_marker(const char *name) /* * Set the mark_entry format to the format found in the element. 
*/ -static int marker_set_format(struct marker_entry **entry, const char *format) +static int marker_set_format(struct marker_entry *entry, const char *format) { - struct marker_entry *e; - size_t name_len = strlen((*entry)->name) + 1; - size_t format_len = strlen(format) + 1; - - - e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, - GFP_KERNEL); - if (!e) + entry->format = kstrdup(format, GFP_KERNEL); + if (!entry->format) return -ENOMEM; - memcpy(&e->name[0], (*entry)->name, name_len); - e->format = &e->name[name_len]; - memcpy(e->format, format, format_len); - if (strcmp(e->format, MARK_NOARGS) == 0) - e->call = marker_probe_cb_noarg; - else - e->call = marker_probe_cb; - e->single = (*entry)->single; - e->multi = (*entry)->multi; - e->ptype = (*entry)->ptype; - e->refcount = (*entry)->refcount; - e->rcu_pending = 0; - hlist_add_before(&e->hlist, &(*entry)->hlist); - hlist_del(&(*entry)->hlist); - /* Make sure the call_rcu has been executed */ - if ((*entry)->rcu_pending) - rcu_barrier_sched(); - kfree(*entry); - *entry = e; + entry->format_allocated = 1; + trace_mark(core_marker_format, "name %s format %s", - e->name, e->format); + entry->name, entry->format); return 0; } /* * Sets the probe callback corresponding to one marker. 
*/ -static int set_marker(struct marker_entry **entry, struct marker *elem, +static int set_marker(struct marker_entry *entry, struct marker *elem, int active) { - int ret; - WARN_ON(strcmp((*entry)->name, elem->name) != 0); + int ret = 0; + WARN_ON(strcmp(entry->name, elem->name) != 0); - if ((*entry)->format) { - if (strcmp((*entry)->format, elem->format) != 0) { + if (entry->format) { + if (strcmp(entry->format, elem->format) != 0) { printk(KERN_NOTICE "Format mismatch for probe %s " "(%s), marker (%s)\n", - (*entry)->name, - (*entry)->format, + entry->name, + entry->format, elem->format); return -EPERM; } @@ -523,37 +504,67 @@ static int set_marker(struct marker_entry **entry, struct marker *elem, * pass from a "safe" callback (with argument) to an "unsafe" * callback (does not set arguments). */ - elem->call = (*entry)->call; + elem->call = entry->call; /* * Sanity check : * We only update the single probe private data when the ptr is * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) */ WARN_ON(elem->single.func != __mark_empty_function - && elem->single.probe_private - != (*entry)->single.probe_private && - !elem->ptype); - elem->single.probe_private = (*entry)->single.probe_private; + && elem->single.probe_private != entry->single.probe_private + && !elem->ptype); + elem->single.probe_private = entry->single.probe_private; /* * Make sure the private data is valid when we update the * single probe ptr. */ smp_wmb(); - elem->single.func = (*entry)->single.func; + elem->single.func = entry->single.func; /* * We also make sure that the new probe callbacks array is consistent * before setting a pointer to it. */ - rcu_assign_pointer(elem->multi, (*entry)->multi); + rcu_assign_pointer(elem->multi, entry->multi); /* * Update the function or multi probe array pointer before setting the * ptype. 
*/ smp_wmb(); - elem->ptype = (*entry)->ptype; + elem->ptype = entry->ptype; + + if (elem->tp_name && (active ^ elem->state)) { + WARN_ON(!elem->tp_cb); + /* + * It is ok to directly call the probe registration because type + * checking has been done in the __trace_mark_tp() macro. + */ + + if (active) { + /* + * try_module_get should always succeed because we hold + * lock_module() to get the tp_cb address. + */ + ret = try_module_get(__module_text_address( + (unsigned long)elem->tp_cb)); + BUG_ON(!ret); + ret = tracepoint_probe_register_noupdate( + elem->tp_name, + elem->tp_cb); + } else { + ret = tracepoint_probe_unregister_noupdate( + elem->tp_name, + elem->tp_cb); + /* + * tracepoint_probe_update_all() must be called + * before the module containing tp_cb is unloaded. + */ + module_put(__module_text_address( + (unsigned long)elem->tp_cb)); + } + } elem->state = active; - return 0; + return ret; } /* @@ -564,7 +575,24 @@ static int set_marker(struct marker_entry **entry, struct marker *elem, */ static void disable_marker(struct marker *elem) { + int ret; + /* leave "call" as is. It is known statically. */ + if (elem->tp_name && elem->state) { + WARN_ON(!elem->tp_cb); + /* + * It is ok to directly call the probe registration because type + * checking has been done in the __trace_mark_tp() macro. + */ + ret = tracepoint_probe_unregister_noupdate(elem->tp_name, + elem->tp_cb); + WARN_ON(ret); + /* + * tracepoint_probe_update_all() must be called + * before the module containing tp_cb is unloaded. 
+ */ + module_put(__module_text_address((unsigned long)elem->tp_cb)); + } elem->state = 0; elem->single.func = __mark_empty_function; /* Update the function before setting the ptype */ @@ -594,8 +622,7 @@ void marker_update_probe_range(struct marker *begin, for (iter = begin; iter < end; iter++) { mark_entry = get_marker(iter->name); if (mark_entry) { - set_marker(&mark_entry, iter, - !!mark_entry->refcount); + set_marker(mark_entry, iter, !!mark_entry->refcount); /* * ignore error, continue */ @@ -629,6 +656,7 @@ static void marker_update_probes(void) marker_update_probe_range(__start___markers, __stop___markers); /* Markers in modules. */ module_update_markers(); + tracepoint_probe_update_all(); } /** @@ -657,7 +685,7 @@ int marker_probe_register(const char *name, const char *format, ret = PTR_ERR(entry); } else if (format) { if (!entry->format) - ret = marker_set_format(&entry, format); + ret = marker_set_format(entry, format); else if (strcmp(entry->format, format)) ret = -EPERM; } @@ -676,10 +704,11 @@ int marker_probe_register(const char *name, const char *format, goto end; } mutex_unlock(&markers_mutex); - marker_update_probes(); /* may update entry */ + marker_update_probes(); mutex_lock(&markers_mutex); entry = get_marker(name); - WARN_ON(!entry); + if (!entry) + goto end; if (entry->rcu_pending) rcu_barrier_sched(); entry->oldptr = old; @@ -720,7 +749,7 @@ int marker_probe_unregister(const char *name, rcu_barrier_sched(); old = marker_entry_remove_probe(entry, probe, probe_private); mutex_unlock(&markers_mutex); - marker_update_probes(); /* may update entry */ + marker_update_probes(); mutex_lock(&markers_mutex); entry = get_marker(name); if (!entry) @@ -801,10 +830,11 @@ int marker_probe_unregister_private_data(marker_probe_func *probe, rcu_barrier_sched(); old = marker_entry_remove_probe(entry, NULL, probe_private); mutex_unlock(&markers_mutex); - marker_update_probes(); /* may update entry */ + marker_update_probes(); mutex_lock(&markers_mutex); entry 
= get_marker_from_private_data(probe, probe_private); - WARN_ON(!entry); + if (!entry) + goto end; if (entry->rcu_pending) rcu_barrier_sched(); entry->oldptr = old; @@ -848,8 +878,6 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe, if (!e->ptype) { if (num == 0 && e->single.func == probe) return e->single.probe_private; - else - break; } else { struct marker_probe_closure *closure; int match = 0; @@ -861,8 +889,42 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe, return closure[i].probe_private; } } + break; } } return ERR_PTR(-ENOENT); } EXPORT_SYMBOL_GPL(marker_get_private_data); + +#ifdef CONFIG_MODULES + +int marker_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + marker_update_probe_range(mod->markers, + mod->markers + mod->num_markers); + break; + case MODULE_STATE_GOING: + marker_update_probe_range(mod->markers, + mod->markers + mod->num_markers); + break; + } + return 0; +} + +struct notifier_block marker_module_nb = { + .notifier_call = marker_module_notify, + .priority = 0, +}; + +static int init_markers(void) +{ + return register_module_notifier(&marker_module_nb); +} +__initcall(init_markers); + +#endif /* CONFIG_MODULES */ diff --git a/kernel/module.c b/kernel/module.c index 1f4cc00e0c2..dd2a54155b5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2184,24 +2184,15 @@ static noinline struct module *load_module(void __user *umod, struct mod_debug *debug; unsigned int num_debug; -#ifdef CONFIG_MARKERS - marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers); -#endif debug = section_objs(hdr, sechdrs, secstrings, "__verbose", sizeof(*debug), &num_debug); dynamic_printk_setup(debug, num_debug); - -#ifdef CONFIG_TRACEPOINTS - tracepoint_update_probe_range(mod->tracepoints, - mod->tracepoints + mod->num_tracepoints); -#endif } /* sechdrs[0].sh_size is always zero */ mseg = 
section_objs(hdr, sechdrs, secstrings, "__mcount_loc", sizeof(*mseg), &num_mcount); - ftrace_init_module(mseg, mseg + num_mcount); + ftrace_init_module(mod, mseg, mseg + num_mcount); err = module_finalize(hdr, sechdrs, mod); if (err < 0) @@ -2713,7 +2704,7 @@ int is_module_address(unsigned long addr) /* Is this a valid kernel address? */ -struct module *__module_text_address(unsigned long addr) +__notrace_funcgraph struct module *__module_text_address(unsigned long addr) { struct module *mod; diff --git a/kernel/power/disk.c b/kernel/power/disk.c index c9d74083746..f77d3819ef5 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -22,7 +22,6 @@ #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> -#include <linux/ftrace.h> #include "power.h" @@ -257,7 +256,7 @@ static int create_image(int platform_mode) int hibernation_snapshot(int platform_mode) { - int error, ftrace_save; + int error; /* Free memory before shutting down devices. */ error = swsusp_shrink_memory(); @@ -269,7 +268,6 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); - ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_FREEZE); if (error) goto Recover_platform; @@ -299,7 +297,6 @@ int hibernation_snapshot(int platform_mode) Resume_devices: device_resume(in_suspend ? (error ? 
PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - __ftrace_enabled_restore(ftrace_save); resume_console(); Close: platform_end(platform_mode); @@ -370,11 +367,10 @@ static int resume_target_kernel(void) int hibernation_restore(int platform_mode) { - int error, ftrace_save; + int error; pm_prepare_console(); suspend_console(); - ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_QUIESCE); if (error) goto Finish; @@ -389,7 +385,6 @@ int hibernation_restore(int platform_mode) platform_restore_cleanup(platform_mode); device_resume(PMSG_RECOVER); Finish: - __ftrace_enabled_restore(ftrace_save); resume_console(); pm_restore_console(); return error; @@ -402,7 +397,7 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { - int error, ftrace_save; + int error; if (!hibernation_ops) return -ENOSYS; @@ -417,7 +412,6 @@ int hibernation_platform_enter(void) goto Close; suspend_console(); - ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -452,7 +446,6 @@ int hibernation_platform_enter(void) hibernation_ops->finish(); Resume_devices: device_resume(PMSG_RESTORE); - __ftrace_enabled_restore(ftrace_save); resume_console(); Close: hibernation_ops->end(); diff --git a/kernel/power/main.c b/kernel/power/main.c index b8f7ce9473e..613f16941b8 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -22,7 +22,6 @@ #include <linux/freezer.h> #include <linux/vmstat.h> #include <linux/syscalls.h> -#include <linux/ftrace.h> #include "power.h" @@ -317,7 +316,7 @@ static int suspend_enter(suspend_state_t state) */ int suspend_devices_and_enter(suspend_state_t state) { - int error, ftrace_save; + int error; if (!suspend_ops) return -ENOSYS; @@ -328,7 +327,6 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); - ftrace_save = __ftrace_enabled_save(); suspend_test_start(); error = device_suspend(PMSG_SUSPEND); if (error) { @@ -360,7 +358,6 
@@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); device_resume(PMSG_RESUME); suspend_test_finish("resume devices"); - __ftrace_enabled_restore(ftrace_save); resume_console(); Close: if (suspend_ops->end) diff --git a/kernel/profile.c b/kernel/profile.c index dc41827fbfe..60adefb59b5 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -544,7 +544,7 @@ static const struct file_operations proc_profile_operations = { }; #ifdef CONFIG_SMP -static inline void profile_nop(void *unused) +static void profile_nop(void *unused) { } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ca2df68faf7..29dc700e198 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -25,6 +25,17 @@ #include <asm/pgtable.h> #include <asm/uaccess.h> + +/* + * Initialize a new task whose father had been ptraced. + * + * Called from copy_process(). + */ +void ptrace_fork(struct task_struct *child, unsigned long clone_flags) +{ + arch_ptrace_fork(child, clone_flags); +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. 
@@ -72,6 +83,7 @@ void __ptrace_unlink(struct task_struct *child) child->parent = child->real_parent; list_del_init(&child->ptrace_entry); + arch_ptrace_untrace(child); if (task_is_traced(child)) ptrace_untrace(child); } diff --git a/kernel/sched.c b/kernel/sched.c index 33cf4a1cbcd..3798b954e6e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -118,6 +118,12 @@ */ #define RUNTIME_INF ((u64)~0ULL) +DEFINE_TRACE(sched_wait_task); +DEFINE_TRACE(sched_wakeup); +DEFINE_TRACE(sched_wakeup_new); +DEFINE_TRACE(sched_switch); +DEFINE_TRACE(sched_migrate_task); + #ifdef CONFIG_SMP /* * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) @@ -1847,6 +1853,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) clock_offset = old_rq->clock - new_rq->clock; + trace_sched_migrate_task(p, task_cpu(p), new_cpu); + #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) p->se.wait_start -= clock_offset; @@ -2318,7 +2326,7 @@ out_activate: success = 1; out_running: - trace_sched_wakeup(rq, p); + trace_sched_wakeup(rq, p, success); check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; @@ -2451,7 +2459,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - trace_sched_wakeup_new(rq, p); + trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2864,7 +2872,6 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) || unlikely(!cpu_active(dest_cpu))) goto out; - trace_sched_migrate_task(rq, p, dest_cpu); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). 
*/ @@ -5912,6 +5919,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; + ftrace_graph_init_task(idle); } /* diff --git a/kernel/signal.c b/kernel/signal.c index 2a64304ed54..8e95855ff3c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -41,6 +41,8 @@ static struct kmem_cache *sigqueue_cachep; +DEFINE_TRACE(sched_signal_send); + static void __user *sig_handler(struct task_struct *t, int sig) { return t->sighand->action[sig - 1].sa.sa_handler; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9d52b57310a..0b627d9c93d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -487,6 +487,26 @@ static struct ctl_table kern_table[] = { .proc_handler = &ftrace_enable_sysctl, }, #endif +#ifdef CONFIG_STACK_TRACER + { + .ctl_name = CTL_UNNUMBERED, + .procname = "stack_tracer_enabled", + .data = &stack_tracer_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &stack_trace_sysctl, + }, +#endif +#ifdef CONFIG_TRACING + { + .ctl_name = CTL_UNNUMBERED, + .procname = "ftrace_dump_on_oops", + .data = &ftrace_dump_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif #ifdef CONFIG_MODULES { .ctl_name = KERN_MODPROBE, diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 33dbefd471e..e2a4ff6fc3a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -3,18 +3,34 @@ # select HAVE_FUNCTION_TRACER: # +config USER_STACKTRACE_SUPPORT + bool + config NOP_TRACER bool config HAVE_FUNCTION_TRACER bool +config HAVE_FUNCTION_GRAPH_TRACER + bool + +config HAVE_FUNCTION_TRACE_MCOUNT_TEST + bool + help + This gets selected when the arch tests the function_trace_stop + variable at the mcount call site. Otherwise, this variable + is tested by the called function. 
+ config HAVE_DYNAMIC_FTRACE bool config HAVE_FTRACE_MCOUNT_RECORD bool +config HAVE_HW_BRANCH_TRACER + bool + config TRACER_MAX_TRACE bool @@ -47,6 +63,20 @@ config FUNCTION_TRACER (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. +config FUNCTION_GRAPH_TRACER + bool "Kernel Function Graph Tracer" + depends on HAVE_FUNCTION_GRAPH_TRACER + depends on FUNCTION_TRACER + default y + help + Enable the kernel to trace a function at both its return + and its entry. + It's first purpose is to trace the duration of functions and + draw a call graph for each thread with some informations like + the return value. + This is done by setting the current return address on the current + task structure into a stack of calls. + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n @@ -138,6 +168,70 @@ config BOOT_TRACER selected, because the self-tests are an initcall as well and that would invalidate the boot trace. ) +config TRACE_BRANCH_PROFILING + bool "Trace likely/unlikely profiler" + depends on DEBUG_KERNEL + select TRACING + help + This tracer profiles all the the likely and unlikely macros + in the kernel. It will display the results in: + + /debugfs/tracing/profile_annotated_branch + + Note: this will add a significant overhead, only turn this + on if you need to profile the system's use of these macros. + + Say N if unsure. + +config PROFILE_ALL_BRANCHES + bool "Profile all if conditionals" + depends on TRACE_BRANCH_PROFILING + help + This tracer profiles all branch conditions. Every if () + taken in the kernel is recorded whether it hit or miss. + The results will be displayed in: + + /debugfs/tracing/profile_branch + + This configuration, when enabled, will impose a great overhead + on the system. This should only be enabled when the system + is to be analyzed + + Say N if unsure. 
+ +config TRACING_BRANCHES + bool + help + Selected by tracers that will trace the likely and unlikely + conditions. This prevents the tracers themselves from being + profiled. Profiling the tracing infrastructure can only happen + when the likelys and unlikelys are not being traced. + +config BRANCH_TRACER + bool "Trace likely/unlikely instances" + depends on TRACE_BRANCH_PROFILING + select TRACING_BRANCHES + help + This traces the events of likely and unlikely condition + calls in the kernel. The difference between this and the + "Trace likely/unlikely profiler" is that this is not a + histogram of the callers, but actually places the calling + events into a running trace buffer to see when and where the + events happened, as well as their results. + + Say N if unsure. + +config POWER_TRACER + bool "Trace power consumption behavior" + depends on DEBUG_KERNEL + depends on X86 + select TRACING + help + This tracer helps developers to analyze and optimize the kernels + power management decisions, specifically the C-state and P-state + behavior. + + config STACK_TRACER bool "Trace max stack" depends on HAVE_FUNCTION_TRACER @@ -150,13 +244,26 @@ config STACK_TRACER This tracer works by hooking into every function call that the kernel executes, and keeping a maximum stack depth value and - stack-trace saved. Because this logic has to execute in every - kernel function, all the time, this option can slow down the - kernel measurably and is generally intended for kernel - developers only. + stack-trace saved. If this is configured with DYNAMIC_FTRACE + then it will not have any overhead while the stack tracer + is disabled. + + To enable the stack tracer on bootup, pass in 'stacktrace' + on the kernel command line. + + The stack tracer can also be enabled or disabled via the + sysctl kernel.stack_tracer_enabled Say N if unsure. 
+config HW_BRANCH_TRACER + depends on HAVE_HW_BRANCH_TRACER + bool "Trace hw branches" + select TRACING + help + This tracer records all branches on the system in a circular + buffer giving access to the last N branches for each cpu. + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c8228b1a49e..349d5a93653 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -10,6 +10,11 @@ CFLAGS_trace_selftest_dynamic.o = -pg obj-y += trace_selftest_dynamic.o endif +# If unlikely tracing is enabled, do not trace these files +ifdef CONFIG_TRACING_BRANCHES +KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING +endif + obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o @@ -24,5 +29,9 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o +obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o +obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o +obj-$(CONFIG_POWER_TRACER) += trace_power.o libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 78db083390f..2f32969c09d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -47,6 +47,13 @@ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; +/* set when tracing only a pid */ +struct pid *ftrace_pid_trace; +static struct pid * const ftrace_swapper_pid = &init_struct_pid; + +/* Quick disabling of function tracer. */ +int function_trace_stop; + /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. 
@@ -55,6 +62,7 @@ static int ftrace_disabled __read_mostly; static DEFINE_SPINLOCK(ftrace_lock); static DEFINE_MUTEX(ftrace_sysctl_lock); +static DEFINE_MUTEX(ftrace_start_lock); static struct ftrace_ops ftrace_list_end __read_mostly = { @@ -63,6 +71,8 @@ static struct ftrace_ops ftrace_list_end __read_mostly = static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; +ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; +ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { @@ -79,6 +89,21 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) }; } +static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) +{ + if (!test_tsk_trace_trace(current)) + return; + + ftrace_pid_function(ip, parent_ip); +} + +static void set_ftrace_pid_function(ftrace_func_t func) +{ + /* do not set ftrace_pid_function to itself! */ + if (func != ftrace_pid_func) + ftrace_pid_function = func; +} + /** * clear_ftrace_function - reset the ftrace function * @@ -88,7 +113,23 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) void clear_ftrace_function(void) { ftrace_trace_function = ftrace_stub; + __ftrace_trace_function = ftrace_stub; + ftrace_pid_function = ftrace_stub; +} + +#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST +/* + * For those archs that do not test function_trace_stop in their 
+ */ +static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) +{ + if (function_trace_stop) + return; + + __ftrace_trace_function(ip, parent_ip); } +#endif static int __register_ftrace_function(struct ftrace_ops *ops) { @@ -106,14 +147,28 @@ static int __register_ftrace_function(struct ftrace_ops *ops) ftrace_list = ops; if (ftrace_enabled) { + ftrace_func_t func; + + if (ops->next == &ftrace_list_end) + func = ops->func; + else + func = ftrace_list_func; + + if (ftrace_pid_trace) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } + /* * For one func, simply call it directly. * For more than one func, call the chain. */ - if (ops->next == &ftrace_list_end) - ftrace_trace_function = ops->func; - else - ftrace_trace_function = ftrace_list_func; +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + ftrace_trace_function = func; +#else + __ftrace_trace_function = func; + ftrace_trace_function = ftrace_test_stop_func; +#endif } spin_unlock(&ftrace_lock); @@ -152,9 +207,19 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_enabled) { /* If we only have one func left, then call that directly */ - if (ftrace_list == &ftrace_list_end || - ftrace_list->next == &ftrace_list_end) - ftrace_trace_function = ftrace_list->func; + if (ftrace_list->next == &ftrace_list_end) { + ftrace_func_t func = ftrace_list->func; + + if (ftrace_pid_trace) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + ftrace_trace_function = func; +#else + __ftrace_trace_function = func; +#endif + } } out: @@ -163,6 +228,36 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) return ret; } +static void ftrace_update_pid_func(void) +{ + ftrace_func_t func; + + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); + + if (ftrace_trace_function == ftrace_stub) + goto out; + + func = ftrace_trace_function; + + if (ftrace_pid_trace) { + 
set_ftrace_pid_function(func); + func = ftrace_pid_func; + } else { + if (func == ftrace_pid_func) + func = ftrace_pid_function; + } + +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + ftrace_trace_function = func; +#else + __ftrace_trace_function = func; +#endif + + out: + spin_unlock(&ftrace_lock); +} + #ifdef CONFIG_DYNAMIC_FTRACE #ifndef CONFIG_FTRACE_MCOUNT_RECORD # error Dynamic ftrace depends on MCOUNT_RECORD @@ -182,6 +277,8 @@ enum { FTRACE_UPDATE_TRACE_FUNC = (1 << 2), FTRACE_ENABLE_MCOUNT = (1 << 3), FTRACE_DISABLE_MCOUNT = (1 << 4), + FTRACE_START_FUNC_RET = (1 << 5), + FTRACE_STOP_FUNC_RET = (1 << 6), }; static int ftrace_filtered; @@ -308,7 +405,7 @@ ftrace_record_ip(unsigned long ip) { struct dyn_ftrace *rec; - if (!ftrace_enabled || ftrace_disabled) + if (ftrace_disabled) return NULL; rec = ftrace_alloc_dyn_node(ip); @@ -322,14 +419,51 @@ ftrace_record_ip(unsigned long ip) return rec; } -#define FTRACE_ADDR ((long)(ftrace_caller)) +static void print_ip_ins(const char *fmt, unsigned char *p) +{ + int i; + + printk(KERN_CONT "%s", fmt); + + for (i = 0; i < MCOUNT_INSN_SIZE; i++) + printk(KERN_CONT "%s%02x", i ? 
":" : "", p[i]); +} + +static void ftrace_bug(int failed, unsigned long ip) +{ + switch (failed) { + case -EFAULT: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace faulted on modifying "); + print_ip_sym(ip); + break; + case -EINVAL: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace failed to modify "); + print_ip_sym(ip); + print_ip_ins(" actual: ", (unsigned char *)ip); + printk(KERN_CONT "\n"); + break; + case -EPERM: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace faulted on writing "); + print_ip_sym(ip); + break; + default: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace faulted on unknown error "); + print_ip_sym(ip); + } +} + static int -__ftrace_replace_code(struct dyn_ftrace *rec, - unsigned char *nop, int enable) +__ftrace_replace_code(struct dyn_ftrace *rec, int enable) { unsigned long ip, fl; - unsigned char *call, *old, *new; + unsigned long ftrace_addr; + + ftrace_addr = (unsigned long)ftrace_caller; ip = rec->ip; @@ -388,34 +522,28 @@ __ftrace_replace_code(struct dyn_ftrace *rec, } } - call = ftrace_call_replace(ip, FTRACE_ADDR); - - if (rec->flags & FTRACE_FL_ENABLED) { - old = nop; - new = call; - } else { - old = call; - new = nop; - } - - return ftrace_modify_code(ip, old, new); + if (rec->flags & FTRACE_FL_ENABLED) + return ftrace_make_call(rec, ftrace_addr); + else + return ftrace_make_nop(NULL, rec, ftrace_addr); } static void ftrace_replace_code(int enable) { int i, failed; - unsigned char *nop = NULL; struct dyn_ftrace *rec; struct ftrace_page *pg; - nop = ftrace_nop_replace(); - for (pg = ftrace_pages_start; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { rec = &pg->records[i]; - /* don't modify code that has already faulted */ - if (rec->flags & FTRACE_FL_FAILED) + /* + * Skip over free records and records that have + * failed. 
+ */ + if (rec->flags & FTRACE_FL_FREE || + rec->flags & FTRACE_FL_FAILED) continue; /* ignore updates to this record's mcount site */ @@ -426,68 +554,30 @@ static void ftrace_replace_code(int enable) unfreeze_record(rec); } - failed = __ftrace_replace_code(rec, nop, enable); + failed = __ftrace_replace_code(rec, enable); if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { rec->flags |= FTRACE_FL_FAILED; if ((system_state == SYSTEM_BOOTING) || !core_kernel_text(rec->ip)) { ftrace_free_rec(rec); - } + } else + ftrace_bug(failed, rec->ip); } } } } -static void print_ip_ins(const char *fmt, unsigned char *p) -{ - int i; - - printk(KERN_CONT "%s", fmt); - - for (i = 0; i < MCOUNT_INSN_SIZE; i++) - printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); -} - static int -ftrace_code_disable(struct dyn_ftrace *rec) +ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) { unsigned long ip; - unsigned char *nop, *call; int ret; ip = rec->ip; - nop = ftrace_nop_replace(); - call = ftrace_call_replace(ip, mcount_addr); - - ret = ftrace_modify_code(ip, call, nop); + ret = ftrace_make_nop(mod, rec, mcount_addr); if (ret) { - switch (ret) { - case -EFAULT: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace faulted on modifying "); - print_ip_sym(ip); - break; - case -EINVAL: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace failed to modify "); - print_ip_sym(ip); - print_ip_ins(" expected: ", call); - print_ip_ins(" actual: ", (unsigned char *)ip); - print_ip_ins(" replace: ", nop); - printk(KERN_CONT "\n"); - break; - case -EPERM: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace faulted on writing "); - print_ip_sym(ip); - break; - default: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace faulted on unknown error "); - print_ip_sym(ip); - } - + ftrace_bug(ret, ip); rec->flags |= FTRACE_FL_FAILED; return 0; } @@ -506,6 +596,11 @@ static int __ftrace_modify_code(void *data) if (*command & FTRACE_UPDATE_TRACE_FUNC) ftrace_update_ftrace_func(ftrace_trace_function); + if (*command & 
FTRACE_START_FUNC_RET) + ftrace_enable_ftrace_graph_caller(); + else if (*command & FTRACE_STOP_FUNC_RET) + ftrace_disable_ftrace_graph_caller(); + return 0; } @@ -515,43 +610,43 @@ static void ftrace_run_update_code(int command) } static ftrace_func_t saved_ftrace_func; -static int ftrace_start; -static DEFINE_MUTEX(ftrace_start_lock); +static int ftrace_start_up; -static void ftrace_startup(void) +static void ftrace_startup_enable(int command) { - int command = 0; - - if (unlikely(ftrace_disabled)) - return; - - mutex_lock(&ftrace_start_lock); - ftrace_start++; - command |= FTRACE_ENABLE_CALLS; - if (saved_ftrace_func != ftrace_trace_function) { saved_ftrace_func = ftrace_trace_function; command |= FTRACE_UPDATE_TRACE_FUNC; } if (!command || !ftrace_enabled) - goto out; + return; ftrace_run_update_code(command); - out: - mutex_unlock(&ftrace_start_lock); } -static void ftrace_shutdown(void) +static void ftrace_startup(int command) { - int command = 0; + if (unlikely(ftrace_disabled)) + return; + + mutex_lock(&ftrace_start_lock); + ftrace_start_up++; + command |= FTRACE_ENABLE_CALLS; + ftrace_startup_enable(command); + + mutex_unlock(&ftrace_start_lock); +} + +static void ftrace_shutdown(int command) +{ if (unlikely(ftrace_disabled)) return; mutex_lock(&ftrace_start_lock); - ftrace_start--; - if (!ftrace_start) + ftrace_start_up--; + if (!ftrace_start_up) command |= FTRACE_DISABLE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { @@ -577,8 +672,8 @@ static void ftrace_startup_sysctl(void) mutex_lock(&ftrace_start_lock); /* Force update next time */ saved_ftrace_func = NULL; - /* ftrace_start is true if we want ftrace running */ - if (ftrace_start) + /* ftrace_start_up is true if we want ftrace running */ + if (ftrace_start_up) command |= FTRACE_ENABLE_CALLS; ftrace_run_update_code(command); @@ -593,8 +688,8 @@ static void ftrace_shutdown_sysctl(void) return; mutex_lock(&ftrace_start_lock); - /* ftrace_start is true if ftrace is running */ - if 
(ftrace_start) + /* ftrace_start_up is true if ftrace is running */ + if (ftrace_start_up) command |= FTRACE_DISABLE_CALLS; ftrace_run_update_code(command); @@ -605,7 +700,7 @@ static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; -static int ftrace_update_code(void) +static int ftrace_update_code(struct module *mod) { struct dyn_ftrace *p, *t; cycle_t start, stop; @@ -622,7 +717,7 @@ static int ftrace_update_code(void) list_del_init(&p->list); /* convert record (i.e, patch mcount-call with NOP) */ - if (ftrace_code_disable(p)) { + if (ftrace_code_disable(mod, p)) { p->flags |= FTRACE_FL_CONVERTED; ftrace_update_cnt++; } else @@ -690,7 +785,6 @@ enum { #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { - loff_t pos; struct ftrace_page *pg; unsigned idx; unsigned flags; @@ -715,6 +809,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) iter->pg = iter->pg->next; iter->idx = 0; goto retry; + } else { + iter->idx = -1; } } else { rec = &iter->pg->records[iter->idx++]; @@ -737,8 +833,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } spin_unlock(&ftrace_lock); - iter->pos = *pos; - return rec; } @@ -746,13 +840,15 @@ static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; void *p = NULL; - loff_t l = -1; - if (*pos > iter->pos) - *pos = iter->pos; + if (*pos > 0) { + if (iter->idx < 0) + return p; + (*pos)--; + iter->idx--; + } - l = *pos; - p = t_next(m, p, &l); + p = t_next(m, p, pos); return p; } @@ -763,21 +859,15 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { - struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec = v; char str[KSYM_SYMBOL_LEN]; - int ret = 0; if (!rec) return 0; kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - ret = seq_printf(m, "%s\n", str); - if (ret < 0) { - iter->pos--; - iter->idx--; - } + seq_printf(m, "%s\n", str); return 0; 
} @@ -803,7 +893,6 @@ ftrace_avail_open(struct inode *inode, struct file *file) return -ENOMEM; iter->pg = ftrace_pages_start; - iter->pos = 0; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -890,7 +979,6 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; - iter->pos = 0; iter->flags = enable ? FTRACE_ITER_FILTER : FTRACE_ITER_NOTRACE; @@ -959,6 +1047,13 @@ ftrace_match(unsigned char *buff, int len, int enable) int type = MATCH_FULL; unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; unsigned i, match = 0, search_len = 0; + int not = 0; + + if (buff[0] == '!') { + not = 1; + buff++; + len--; + } for (i = 0; i < len; i++) { if (buff[i] == '*') { @@ -1012,8 +1107,12 @@ ftrace_match(unsigned char *buff, int len, int enable) matched = 1; break; } - if (matched) - rec->flags |= flag; + if (matched) { + if (not) + rec->flags &= ~flag; + else + rec->flags |= flag; + } } pg = pg->next; } @@ -1181,7 +1280,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftrace_start_lock); - if (ftrace_start && ftrace_enabled) + if (ftrace_start_up && ftrace_enabled) ftrace_run_update_code(FTRACE_ENABLE_CALLS); mutex_unlock(&ftrace_start_lock); mutex_unlock(&ftrace_sysctl_lock); @@ -1233,12 +1332,233 @@ static struct file_operations ftrace_notrace_fops = { .release = ftrace_notrace_release, }; -static __init int ftrace_init_debugfs(void) +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +static DEFINE_MUTEX(graph_lock); + +int ftrace_graph_count; +unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; + +static void * +g_next(struct seq_file *m, void *v, loff_t *pos) { - struct dentry *d_tracer; - struct dentry *entry; + unsigned long *array = m->private; + int index = *pos; - d_tracer = tracing_init_dentry(); + (*pos)++; + + if (index >= ftrace_graph_count) + return NULL; + + return &array[index]; 
+} + +static void *g_start(struct seq_file *m, loff_t *pos) +{ + void *p = NULL; + + mutex_lock(&graph_lock); + + p = g_next(m, p, pos); + + return p; +} + +static void g_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&graph_lock); +} + +static int g_show(struct seq_file *m, void *v) +{ + unsigned long *ptr = v; + char str[KSYM_SYMBOL_LEN]; + + if (!ptr) + return 0; + + kallsyms_lookup(*ptr, NULL, NULL, NULL, str); + + seq_printf(m, "%s\n", str); + + return 0; +} + +static struct seq_operations ftrace_graph_seq_ops = { + .start = g_start, + .next = g_next, + .stop = g_stop, + .show = g_show, +}; + +static int +ftrace_graph_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + mutex_lock(&graph_lock); + if ((file->f_mode & FMODE_WRITE) && + !(file->f_flags & O_APPEND)) { + ftrace_graph_count = 0; + memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); + } + + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, &ftrace_graph_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = ftrace_graph_funcs; + } + } else + file->private_data = ftrace_graph_funcs; + mutex_unlock(&graph_lock); + + return ret; +} + +static ssize_t +ftrace_graph_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + if (file->f_mode & FMODE_READ) + return seq_read(file, ubuf, cnt, ppos); + else + return -EPERM; +} + +static int +ftrace_set_func(unsigned long *array, int idx, char *buffer) +{ + char str[KSYM_SYMBOL_LEN]; + struct dyn_ftrace *rec; + struct ftrace_page *pg; + int found = 0; + int i, j; + + if (ftrace_disabled) + return -ENODEV; + + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + + if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) + continue; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + if 
(strcmp(str, buffer) == 0) { + found = 1; + for (j = 0; j < idx; j++) + if (array[j] == rec->ip) { + found = 0; + break; + } + if (found) + array[idx] = rec->ip; + break; + } + } + } + spin_unlock(&ftrace_lock); + + return found ? 0 : -EINVAL; +} + +static ssize_t +ftrace_graph_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned char buffer[FTRACE_BUFF_MAX+1]; + unsigned long *array; + size_t read = 0; + ssize_t ret; + int index = 0; + char ch; + + if (!cnt || cnt < 0) + return 0; + + mutex_lock(&graph_lock); + + if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { + ret = -EBUSY; + goto out; + } + + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + array = m->private; + } else + array = file->private_data; + + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + if (isspace(ch)) { + *ppos += read; + ret = read; + goto out; + } + + while (cnt && !isspace(ch)) { + if (index < FTRACE_BUFF_MAX) + buffer[index++] = ch; + else { + ret = -EINVAL; + goto out; + } + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + buffer[index] = 0; + + /* we allow only one at a time */ + ret = ftrace_set_func(array, ftrace_graph_count, buffer); + if (ret) + goto out; + + ftrace_graph_count++; + + file->f_pos += read; + + ret = read; + out: + mutex_unlock(&graph_lock); + + return ret; +} + +static const struct file_operations ftrace_graph_fops = { + .open = ftrace_graph_open, + .read = ftrace_graph_read, + .write = ftrace_graph_write, +}; +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) +{ + struct dentry *entry; entry = debugfs_create_file("available_filter_functions", 0444, d_tracer, NULL, &ftrace_avail_fops); @@ -1263,12 +1583,20 @@ static __init int ftrace_init_debugfs(void) 
pr_warning("Could not create debugfs " "'set_ftrace_notrace' entry\n"); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + entry = debugfs_create_file("set_graph_function", 0444, d_tracer, + NULL, + &ftrace_graph_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_graph_function' entry\n"); +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + return 0; } -fs_initcall(ftrace_init_debugfs); - -static int ftrace_convert_nops(unsigned long *start, +static int ftrace_convert_nops(struct module *mod, + unsigned long *start, unsigned long *end) { unsigned long *p; @@ -1279,23 +1607,32 @@ static int ftrace_convert_nops(unsigned long *start, p = start; while (p < end) { addr = ftrace_call_adjust(*p++); + /* + * Some architecture linkers will pad between + * the different mcount_loc sections of different + * object files to satisfy alignments. + * Skip any NULL pointers. + */ + if (!addr) + continue; ftrace_record_ip(addr); } /* disable interrupts to prevent kstop machine */ local_irq_save(flags); - ftrace_update_code(); + ftrace_update_code(mod); local_irq_restore(flags); mutex_unlock(&ftrace_start_lock); return 0; } -void ftrace_init_module(unsigned long *start, unsigned long *end) +void ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end) { if (ftrace_disabled || start == end) return; - ftrace_convert_nops(start, end); + ftrace_convert_nops(mod, start, end); } extern unsigned long __start_mcount_loc[]; @@ -1325,7 +1662,8 @@ void __init ftrace_init(void) last_ftrace_enabled = ftrace_enabled = 1; - ret = ftrace_convert_nops(__start_mcount_loc, + ret = ftrace_convert_nops(NULL, + __start_mcount_loc, __stop_mcount_loc); return; @@ -1342,12 +1680,186 @@ static int __init ftrace_nodyn_init(void) } device_initcall(ftrace_nodyn_init); -# define ftrace_startup() do { } while (0) -# define ftrace_shutdown() do { } while (0) +static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } +static inline void ftrace_startup_enable(int 
command) { } +/* Keep as macros so we do not need to define the commands */ +# define ftrace_startup(command) do { } while (0) +# define ftrace_shutdown(command) do { } while (0) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ +static ssize_t +ftrace_pid_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + if (ftrace_pid_trace == ftrace_swapper_pid) + r = sprintf(buf, "swapper tasks\n"); + else if (ftrace_pid_trace) + r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace)); + else + r = sprintf(buf, "no pid\n"); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static void clear_ftrace_swapper(void) +{ + struct task_struct *p; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + p = idle_task(cpu); + clear_tsk_trace_trace(p); + } + put_online_cpus(); +} + +static void set_ftrace_swapper(void) +{ + struct task_struct *p; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + p = idle_task(cpu); + set_tsk_trace_trace(p); + } + put_online_cpus(); +} + +static void clear_ftrace_pid(struct pid *pid) +{ + struct task_struct *p; + + do_each_pid_task(pid, PIDTYPE_PID, p) { + clear_tsk_trace_trace(p); + } while_each_pid_task(pid, PIDTYPE_PID, p); + put_pid(pid); +} + +static void set_ftrace_pid(struct pid *pid) +{ + struct task_struct *p; + + do_each_pid_task(pid, PIDTYPE_PID, p) { + set_tsk_trace_trace(p); + } while_each_pid_task(pid, PIDTYPE_PID, p); +} + +static void clear_ftrace_pid_task(struct pid **pid) +{ + if (*pid == ftrace_swapper_pid) + clear_ftrace_swapper(); + else + clear_ftrace_pid(*pid); + + *pid = NULL; +} + +static void set_ftrace_pid_task(struct pid *pid) +{ + if (pid == ftrace_swapper_pid) + set_ftrace_swapper(); + else + set_ftrace_pid(pid); +} + +static ssize_t +ftrace_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct pid *pid; + char 
buf[64]; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtol(buf, 10, &val); + if (ret < 0) + return ret; + + mutex_lock(&ftrace_start_lock); + if (val < 0) { + /* disable pid tracing */ + if (!ftrace_pid_trace) + goto out; + + clear_ftrace_pid_task(&ftrace_pid_trace); + + } else { + /* swapper task is special */ + if (!val) { + pid = ftrace_swapper_pid; + if (pid == ftrace_pid_trace) + goto out; + } else { + pid = find_get_pid(val); + + if (pid == ftrace_pid_trace) { + put_pid(pid); + goto out; + } + } + + if (ftrace_pid_trace) + clear_ftrace_pid_task(&ftrace_pid_trace); + + if (!pid) + goto out; + + ftrace_pid_trace = pid; + + set_ftrace_pid_task(ftrace_pid_trace); + } + + /* update the function call */ + ftrace_update_pid_func(); + ftrace_startup_enable(0); + + out: + mutex_unlock(&ftrace_start_lock); + + return cnt; +} + +static struct file_operations ftrace_pid_fops = { + .read = ftrace_pid_read, + .write = ftrace_pid_write, +}; + +static __init int ftrace_init_debugfs(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + ftrace_init_dyn_debugfs(d_tracer); + + entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer, + NULL, &ftrace_pid_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_ftrace_pid' entry\n"); + return 0; +} + +fs_initcall(ftrace_init_debugfs); + /** * ftrace_kill - kill ftrace * @@ -1381,10 +1893,11 @@ int register_ftrace_function(struct ftrace_ops *ops) return -1; mutex_lock(&ftrace_sysctl_lock); + ret = __register_ftrace_function(ops); - ftrace_startup(); - mutex_unlock(&ftrace_sysctl_lock); + ftrace_startup(0); + mutex_unlock(&ftrace_sysctl_lock); return ret; } @@ -1400,7 +1913,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_sysctl_lock); ret = __unregister_ftrace_function(ops); - 
ftrace_shutdown(); + ftrace_shutdown(0); mutex_unlock(&ftrace_sysctl_lock); return ret; @@ -1449,3 +1962,153 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, return ret; } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +static atomic_t ftrace_graph_active; + +int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) +{ + return 0; +} + +/* The callbacks that hook a function */ +trace_func_graph_ret_t ftrace_graph_return = + (trace_func_graph_ret_t)ftrace_stub; +trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; + +/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ +static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) +{ + int i; + int ret = 0; + unsigned long flags; + int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; + struct task_struct *g, *t; + + for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { + ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack_list[i]) { + start = 0; + end = i; + ret = -ENOMEM; + goto free; + } + } + + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, t) { + if (start == end) { + ret = -EAGAIN; + goto unlock; + } + + if (t->ret_stack == NULL) { + t->curr_ret_stack = -1; + /* Make sure IRQs see the -1 first: */ + barrier(); + t->ret_stack = ret_stack_list[start++]; + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + } + } while_each_thread(g, t); + +unlock: + read_unlock_irqrestore(&tasklist_lock, flags); +free: + for (i = start; i < end; i++) + kfree(ret_stack_list[i]); + return ret; +} + +/* Allocate a return stack for each task */ +static int start_graph_tracing(void) +{ + struct ftrace_ret_stack **ret_stack_list; + int ret; + + ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE * + sizeof(struct ftrace_ret_stack *), + GFP_KERNEL); + + if (!ret_stack_list) + return -ENOMEM; + + do { + ret = alloc_retstack_tasklist(ret_stack_list); + } while (ret == -EAGAIN); + 
+ kfree(ret_stack_list); + return ret; +} + +int register_ftrace_graph(trace_func_graph_ret_t retfunc, + trace_func_graph_ent_t entryfunc) +{ + int ret = 0; + + mutex_lock(&ftrace_sysctl_lock); + + atomic_inc(&ftrace_graph_active); + ret = start_graph_tracing(); + if (ret) { + atomic_dec(&ftrace_graph_active); + goto out; + } + + ftrace_graph_return = retfunc; + ftrace_graph_entry = entryfunc; + + ftrace_startup(FTRACE_START_FUNC_RET); + +out: + mutex_unlock(&ftrace_sysctl_lock); + return ret; +} + +void unregister_ftrace_graph(void) +{ + mutex_lock(&ftrace_sysctl_lock); + + atomic_dec(&ftrace_graph_active); + ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_entry = ftrace_graph_entry_stub; + ftrace_shutdown(FTRACE_STOP_FUNC_RET); + + mutex_unlock(&ftrace_sysctl_lock); +} + +/* Allocate a return stack for newly created task */ +void ftrace_graph_init_task(struct task_struct *t) +{ + if (atomic_read(&ftrace_graph_active)) { + t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!t->ret_stack) + return; + t->curr_ret_stack = -1; + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + } else + t->ret_stack = NULL; +} + +void ftrace_graph_exit_task(struct task_struct *t) +{ + struct ftrace_ret_stack *ret_stack = t->ret_stack; + + t->ret_stack = NULL; + /* NULL must become visible to IRQs before we free it: */ + barrier(); + + kfree(ret_stack); +} + +void ftrace_graph_stop(void) +{ + ftrace_stop(); +} +#endif + diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 668bbb5ef2b..76f34c0ef29 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -18,8 +18,46 @@ #include "trace.h" -/* Global flag to disable all recording to ring buffers */ -static int ring_buffers_off __read_mostly; +/* + * A fast way to enable or disable all ring buffers is to + * call tracing_on or tracing_off. 
Turning off the ring buffers + * prevents all ring buffers from being recorded to. + * Turning this switch on, makes it OK to write to the + * ring buffer, if the ring buffer is enabled itself. + * + * There's three layers that must be on in order to write + * to the ring buffer. + * + * 1) This global flag must be set. + * 2) The ring buffer must be enabled for recording. + * 3) The per cpu buffer must be enabled for recording. + * + * In case of an anomaly, this global flag has a bit set that + * will permanently disable all ring buffers. + */ + +/* + * Global flag to disable all recording to ring buffers + * This has two bits: ON, DISABLED + * + * ON DISABLED + * ---- ---------- + * 0 0 : ring buffers are off + * 1 0 : ring buffers are on + * X 1 : ring buffers are permanently disabled + */ + +enum { + RB_BUFFERS_ON_BIT = 0, + RB_BUFFERS_DISABLED_BIT = 1, +}; + +enum { + RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT, + RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT, +}; + +static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; /** * tracing_on - enable all tracing buffers @@ -29,7 +67,7 @@ static int ring_buffers_off __read_mostly; */ void tracing_on(void) { - ring_buffers_off = 0; + set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); } /** @@ -42,9 +80,22 @@ void tracing_on(void) */ void tracing_off(void) { - ring_buffers_off = 1; + clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); } +/** + * tracing_off_permanent - permanently disable ring buffers + * + * This function, once called, will disable all ring buffers + * permanently. 
+ */ +void tracing_off_permanent(void) +{ + set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); +} + +#include "trace.h" + /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 @@ -56,7 +107,7 @@ u64 ring_buffer_time_stamp(int cpu) preempt_disable_notrace(); /* shift to debug/test normalization and TIME_EXTENTS */ time = sched_clock() << DEBUG_SHIFT; - preempt_enable_notrace(); + preempt_enable_no_resched_notrace(); return time; } @@ -144,20 +195,24 @@ void *ring_buffer_event_data(struct ring_buffer_event *event) #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) -/* - * This hack stolen from mm/slob.c. - * We can store per page timing information in the page frame of the page. - * Thanks to Peter Zijlstra for suggesting this idea. - */ -struct buffer_page { +struct buffer_data_page { u64 time_stamp; /* page time stamp */ - local_t write; /* index for next write */ local_t commit; /* write commited index */ + unsigned char data[]; /* data of buffer page */ +}; + +struct buffer_page { + local_t write; /* index for next write */ unsigned read; /* index for next read */ struct list_head list; /* list of free pages */ - void *page; /* Actual data page */ + struct buffer_data_page *page; /* Actual data page */ }; +static void rb_init_page(struct buffer_data_page *bpage) +{ + local_set(&bpage->commit, 0); +} + /* * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing * this issue out. @@ -179,7 +234,7 @@ static inline int test_time_stamp(u64 delta) return 0; } -#define BUF_PAGE_SIZE PAGE_SIZE +#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page)) /* * head_page == tail_page && head == tail then buffer is empty. 
@@ -187,7 +242,8 @@ static inline int test_time_stamp(u64 delta) struct ring_buffer_per_cpu { int cpu; struct ring_buffer *buffer; - spinlock_t lock; + spinlock_t reader_lock; /* serialize readers */ + raw_spinlock_t lock; struct lock_class_key lock_key; struct list_head pages; struct buffer_page *head_page; /* read from head */ @@ -202,7 +258,6 @@ struct ring_buffer_per_cpu { }; struct ring_buffer { - unsigned long size; unsigned pages; unsigned flags; int cpus; @@ -221,32 +276,16 @@ struct ring_buffer_iter { u64 read_stamp; }; +/* buffer may be either ring_buffer or ring_buffer_per_cpu */ #define RB_WARN_ON(buffer, cond) \ - do { \ - if (unlikely(cond)) { \ - atomic_inc(&buffer->record_disabled); \ - WARN_ON(1); \ - } \ - } while (0) - -#define RB_WARN_ON_RET(buffer, cond) \ - do { \ - if (unlikely(cond)) { \ - atomic_inc(&buffer->record_disabled); \ - WARN_ON(1); \ - return -1; \ - } \ - } while (0) - -#define RB_WARN_ON_ONCE(buffer, cond) \ - do { \ - static int once; \ - if (unlikely(cond) && !once) { \ - once++; \ + ({ \ + int _____ret = unlikely(cond); \ + if (_____ret) { \ atomic_inc(&buffer->record_disabled); \ WARN_ON(1); \ } \ - } while (0) + _____ret; \ + }) /** * check_pages - integrity check of buffer pages @@ -258,16 +297,20 @@ struct ring_buffer_iter { static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *head = &cpu_buffer->pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; - RB_WARN_ON_RET(cpu_buffer, head->next->prev != head); - RB_WARN_ON_RET(cpu_buffer, head->prev->next != head); + if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) + return -1; + if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) + return -1; - list_for_each_entry_safe(page, tmp, head, list) { - RB_WARN_ON_RET(cpu_buffer, - page->list.next->prev != &page->list); - RB_WARN_ON_RET(cpu_buffer, - page->list.prev->next != &page->list); + list_for_each_entry_safe(bpage, tmp, head, list) { + if (RB_WARN_ON(cpu_buffer, + 
bpage->list.next->prev != &bpage->list)) + return -1; + if (RB_WARN_ON(cpu_buffer, + bpage->list.prev->next != &bpage->list)) + return -1; } return 0; @@ -277,22 +320,23 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { struct list_head *head = &cpu_buffer->pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; unsigned long addr; LIST_HEAD(pages); unsigned i; for (i = 0; i < nr_pages; i++) { - page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); - if (!page) + if (!bpage) goto free_pages; - list_add(&page->list, &pages); + list_add(&bpage->list, &pages); addr = __get_free_page(GFP_KERNEL); if (!addr) goto free_pages; - page->page = (void *)addr; + bpage->page = (void *)addr; + rb_init_page(bpage->page); } list_splice(&pages, head); @@ -302,9 +346,9 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, return 0; free_pages: - list_for_each_entry_safe(page, tmp, &pages, list) { - list_del_init(&page->list); - free_buffer_page(page); + list_for_each_entry_safe(bpage, tmp, &pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); } return -ENOMEM; } @@ -313,7 +357,7 @@ static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - struct buffer_page *page; + struct buffer_page *bpage; unsigned long addr; int ret; @@ -324,19 +368,21 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; - spin_lock_init(&cpu_buffer->lock); + spin_lock_init(&cpu_buffer->reader_lock); + cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; INIT_LIST_HEAD(&cpu_buffer->pages); - page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); - if 
(!page) + if (!bpage) goto fail_free_buffer; - cpu_buffer->reader_page = page; + cpu_buffer->reader_page = bpage; addr = __get_free_page(GFP_KERNEL); if (!addr) goto fail_free_reader; - page->page = (void *)addr; + bpage->page = (void *)addr; + rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); @@ -361,14 +407,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *head = &cpu_buffer->pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; list_del_init(&cpu_buffer->reader_page->list); free_buffer_page(cpu_buffer->reader_page); - list_for_each_entry_safe(page, tmp, head, list) { - list_del_init(&page->list); - free_buffer_page(page); + list_for_each_entry_safe(bpage, tmp, head, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); } kfree(cpu_buffer); } @@ -465,7 +511,7 @@ static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); static void rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { - struct buffer_page *page; + struct buffer_page *bpage; struct list_head *p; unsigned i; @@ -473,13 +519,15 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) synchronize_sched(); for (i = 0; i < nr_pages; i++) { - BUG_ON(list_empty(&cpu_buffer->pages)); + if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) + return; p = cpu_buffer->pages.next; - page = list_entry(p, struct buffer_page, list); - list_del_init(&page->list); - free_buffer_page(page); + bpage = list_entry(p, struct buffer_page, list); + list_del_init(&bpage->list); + free_buffer_page(bpage); } - BUG_ON(list_empty(&cpu_buffer->pages)); + if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) + return; rb_reset_cpu(cpu_buffer); @@ -493,7 +541,7 @@ static void rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, struct list_head *pages, unsigned nr_pages) { - struct buffer_page *page; + 
struct buffer_page *bpage; struct list_head *p; unsigned i; @@ -501,11 +549,12 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, synchronize_sched(); for (i = 0; i < nr_pages; i++) { - BUG_ON(list_empty(pages)); + if (RB_WARN_ON(cpu_buffer, list_empty(pages))) + return; p = pages->next; - page = list_entry(p, struct buffer_page, list); - list_del_init(&page->list); - list_add_tail(&page->list, &cpu_buffer->pages); + bpage = list_entry(p, struct buffer_page, list); + list_del_init(&bpage->list); + list_add_tail(&bpage->list, &cpu_buffer->pages); } rb_reset_cpu(cpu_buffer); @@ -532,7 +581,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) { struct ring_buffer_per_cpu *cpu_buffer; unsigned nr_pages, rm_pages, new_pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; unsigned long buffer_size; unsigned long addr; LIST_HEAD(pages); @@ -562,7 +611,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) if (size < buffer_size) { /* easy case, just free pages */ - BUG_ON(nr_pages >= buffer->pages); + if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) { + mutex_unlock(&buffer->mutex); + return -1; + } rm_pages = buffer->pages - nr_pages; @@ -581,21 +633,26 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) * add these pages to the cpu_buffers. 
Otherwise we just free * them all and return -ENOMEM; */ - BUG_ON(nr_pages <= buffer->pages); + if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) { + mutex_unlock(&buffer->mutex); + return -1; + } + new_pages = nr_pages - buffer->pages; for_each_buffer_cpu(buffer, cpu) { for (i = 0; i < new_pages; i++) { - page = kzalloc_node(ALIGN(sizeof(*page), + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); - if (!page) + if (!bpage) goto free_pages; - list_add(&page->list, &pages); + list_add(&bpage->list, &pages); addr = __get_free_page(GFP_KERNEL); if (!addr) goto free_pages; - page->page = (void *)addr; + bpage->page = (void *)addr; + rb_init_page(bpage->page); } } @@ -604,7 +661,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) rb_insert_pages(cpu_buffer, &pages, new_pages); } - BUG_ON(!list_empty(&pages)); + if (RB_WARN_ON(buffer, !list_empty(&pages))) { + mutex_unlock(&buffer->mutex); + return -1; + } out: buffer->pages = nr_pages; @@ -613,9 +673,9 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) return size; free_pages: - list_for_each_entry_safe(page, tmp, &pages, list) { - list_del_init(&page->list); - free_buffer_page(page); + list_for_each_entry_safe(bpage, tmp, &pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); } mutex_unlock(&buffer->mutex); return -ENOMEM; @@ -626,9 +686,15 @@ static inline int rb_null_event(struct ring_buffer_event *event) return event->type == RINGBUF_TYPE_PADDING; } -static inline void *__rb_page_index(struct buffer_page *page, unsigned index) +static inline void * +__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) +{ + return bpage->data + index; +} + +static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) { - return page->page + index; + return bpage->page->data + index; } static inline struct ring_buffer_event * @@ -658,7 +724,7 @@ static inline unsigned rb_page_write(struct 
buffer_page *bpage) static inline unsigned rb_page_commit(struct buffer_page *bpage) { - return local_read(&bpage->commit); + return local_read(&bpage->page->commit); } /* Size is determined by what has been commited */ @@ -693,7 +759,8 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) head += rb_event_length(event)) { event = __rb_page_index(cpu_buffer->head_page, head); - BUG_ON(rb_null_event(event)); + if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) + return; /* Only count data entries */ if (event->type != RINGBUF_TYPE_DATA) continue; @@ -703,14 +770,14 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) } static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page **page) + struct buffer_page **bpage) { - struct list_head *p = (*page)->list.next; + struct list_head *p = (*bpage)->list.next; if (p == &cpu_buffer->pages) p = p->next; - *page = list_entry(p, struct buffer_page, list); + *bpage = list_entry(p, struct buffer_page, list); } static inline unsigned @@ -746,16 +813,18 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, addr &= PAGE_MASK; while (cpu_buffer->commit_page->page != (void *)addr) { - RB_WARN_ON(cpu_buffer, - cpu_buffer->commit_page == cpu_buffer->tail_page); - cpu_buffer->commit_page->commit = + if (RB_WARN_ON(cpu_buffer, + cpu_buffer->commit_page == cpu_buffer->tail_page)) + return; + cpu_buffer->commit_page->page->commit = cpu_buffer->commit_page->write; rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; } /* Now set the commit to the event's index */ - local_set(&cpu_buffer->commit_page->commit, index); + local_set(&cpu_buffer->commit_page->page->commit, index); } static inline void @@ -769,25 +838,38 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * back to us). 
This allows us to do a simple loop to * assign the commit to the tail. */ + again: while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->page->commit = cpu_buffer->commit_page->write; rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; /* add barrier to keep gcc from optimizing too much */ barrier(); } while (rb_commit_index(cpu_buffer) != rb_page_write(cpu_buffer->commit_page)) { - cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->page->commit = cpu_buffer->commit_page->write; barrier(); } + + /* again, keep gcc from optimizing */ + barrier(); + + /* + * If an interrupt came in just after the first while loop + * and pushed the tail page forward, we will be left with + * a dangling commit that will never go forward. + */ + if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + goto again; } static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { - cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp; + cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; cpu_buffer->reader_page->read = 0; } @@ -806,7 +888,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter) else rb_inc_page(cpu_buffer, &iter->head_page); - iter->read_stamp = iter->head_page->time_stamp; + iter->read_stamp = iter->head_page->page->time_stamp; iter->head = 0; } @@ -880,12 +962,15 @@ static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, unsigned type, unsigned long length, u64 *ts) { - struct buffer_page *tail_page, *head_page, *reader_page; + struct buffer_page *tail_page, *head_page, *reader_page, *commit_page; unsigned long tail, write; struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; unsigned long flags; + commit_page = cpu_buffer->commit_page; + /* 
we just need to protect against interrupts */ + barrier(); tail_page = cpu_buffer->tail_page; write = local_add_return(length, &tail_page->write); tail = write - length; @@ -894,7 +979,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (write > BUF_PAGE_SIZE) { struct buffer_page *next_page = tail_page; - spin_lock_irqsave(&cpu_buffer->lock, flags); + local_irq_save(flags); + __raw_spin_lock(&cpu_buffer->lock); rb_inc_page(cpu_buffer, &next_page); @@ -902,14 +988,15 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, reader_page = cpu_buffer->reader_page; /* we grabbed the lock before incrementing */ - RB_WARN_ON(cpu_buffer, next_page == reader_page); + if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) + goto out_unlock; /* * If for some reason, we had an interrupt storm that made * it all the way around the buffer, bail, and warn * about it. */ - if (unlikely(next_page == cpu_buffer->commit_page)) { + if (unlikely(next_page == commit_page)) { WARN_ON_ONCE(1); goto out_unlock; } @@ -940,12 +1027,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, */ if (tail_page == cpu_buffer->tail_page) { local_set(&next_page->write, 0); - local_set(&next_page->commit, 0); + local_set(&next_page->page->commit, 0); cpu_buffer->tail_page = next_page; /* reread the time stamp */ *ts = ring_buffer_time_stamp(cpu_buffer->cpu); - cpu_buffer->tail_page->time_stamp = *ts; + cpu_buffer->tail_page->page->time_stamp = *ts; } /* @@ -970,7 +1057,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, rb_set_commit_to_write(cpu_buffer); } - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -978,7 +1066,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* We reserved something on the buffer */ - BUG_ON(write > BUF_PAGE_SIZE); + if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) + return NULL; event = 
__rb_page_index(tail_page, tail); rb_update_event(event, type, length); @@ -988,12 +1077,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * this page's time stamp. */ if (!tail && rb_is_commit(cpu_buffer, event)) - cpu_buffer->commit_page->time_stamp = *ts; + cpu_buffer->commit_page->page->time_stamp = *ts; return event; out_unlock: - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); return NULL; } @@ -1038,7 +1128,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, event->time_delta = *delta & TS_MASK; event->array[0] = *delta >> TS_SHIFT; } else { - cpu_buffer->commit_page->time_stamp = *ts; + cpu_buffer->commit_page->page->time_stamp = *ts; event->time_delta = 0; event->array[0] = 0; } @@ -1076,10 +1166,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, * storm or we have something buggy. * Bail! */ - if (unlikely(++nr_loops > 1000)) { - RB_WARN_ON(cpu_buffer, 1); + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) return NULL; - } ts = ring_buffer_time_stamp(cpu_buffer->cpu); @@ -1175,15 +1263,14 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, struct ring_buffer_event *event; int cpu, resched; - if (ring_buffers_off) + if (ring_buffer_flags != RB_BUFFERS_ON) return NULL; if (atomic_read(&buffer->record_disabled)) return NULL; /* If we are tracing schedule, we don't want to recurse */ - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); @@ -1214,10 +1301,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, return event; out: - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); return NULL; } @@ -1259,12 +1343,9 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, /* * Only the last preempt count needs to restore preemption. 
*/ - if (preempt_count() == 1) { - if (per_cpu(rb_need_resched, cpu)) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); - } else + if (preempt_count() == 1) + ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); + else preempt_enable_no_resched_notrace(); return 0; @@ -1294,14 +1375,13 @@ int ring_buffer_write(struct ring_buffer *buffer, int ret = -EBUSY; int cpu, resched; - if (ring_buffers_off) + if (ring_buffer_flags != RB_BUFFERS_ON) return -EBUSY; if (atomic_read(&buffer->record_disabled)) return -EBUSY; - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); @@ -1327,10 +1407,7 @@ int ring_buffer_write(struct ring_buffer *buffer, ret = 0; out: - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); return ret; } @@ -1489,14 +1566,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) return overruns; } -/** - * ring_buffer_iter_reset - reset an iterator - * @iter: The iterator to reset - * - * Resets the iterator, so that it will start from the beginning - * again. - */ -void ring_buffer_iter_reset(struct ring_buffer_iter *iter) +static void rb_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; @@ -1511,7 +1581,24 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) if (iter->head) iter->read_stamp = cpu_buffer->read_stamp; else - iter->read_stamp = iter->head_page->time_stamp; + iter->read_stamp = iter->head_page->page->time_stamp; +} + +/** + * ring_buffer_iter_reset - reset an iterator + * @iter: The iterator to reset + * + * Resets the iterator, so that it will start from the beginning + * again. 
+ */ +void ring_buffer_iter_reset(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + unsigned long flags; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + rb_iter_reset(iter); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /** @@ -1597,7 +1684,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) unsigned long flags; int nr_loops = 0; - spin_lock_irqsave(&cpu_buffer->lock, flags); + local_irq_save(flags); + __raw_spin_lock(&cpu_buffer->lock); again: /* @@ -1606,8 +1694,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * a case where we will loop three times. There should be no * reason to loop four times (that I know of). */ - if (unlikely(++nr_loops > 3)) { - RB_WARN_ON(cpu_buffer, 1); + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) { reader = NULL; goto out; } @@ -1619,8 +1706,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto out; /* Never should we have an index greater than the size */ - RB_WARN_ON(cpu_buffer, - cpu_buffer->reader_page->read > rb_page_size(reader)); + if (RB_WARN_ON(cpu_buffer, + cpu_buffer->reader_page->read > rb_page_size(reader))) + goto out; /* check if we caught up to the tail */ reader = NULL; @@ -1637,7 +1725,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->list.prev = reader->list.prev; local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->commit, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); /* Make the reader page now replace the head */ reader->list.prev->next = &cpu_buffer->reader_page->list; @@ -1659,7 +1747,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto again; out: - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); return reader; } @@ -1673,7 +1762,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) reader = 
rb_get_reader_page(cpu_buffer); /* This function should not be called when buffer is empty */ - BUG_ON(!reader); + if (RB_WARN_ON(cpu_buffer, !reader)) + return; event = rb_reader_event(cpu_buffer); @@ -1700,7 +1790,9 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * Check if we are at the end of the buffer. */ if (iter->head >= rb_page_size(iter->head_page)) { - BUG_ON(iter->head_page == cpu_buffer->commit_page); + if (RB_WARN_ON(buffer, + iter->head_page == cpu_buffer->commit_page)) + return; rb_inc_iter(iter); return; } @@ -1713,8 +1805,10 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * This should not be called to advance the header if we are * at the tail of the buffer. */ - BUG_ON((iter->head_page == cpu_buffer->commit_page) && - (iter->head + length > rb_commit_index(cpu_buffer))); + if (RB_WARN_ON(cpu_buffer, + (iter->head_page == cpu_buffer->commit_page) && + (iter->head + length > rb_commit_index(cpu_buffer)))) + return; rb_update_iter_read_stamp(iter, event); @@ -1726,17 +1820,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) rb_advance_iter(iter); } -/** - * ring_buffer_peek - peek at the next event to be read - * @buffer: The ring buffer to read - * @cpu: The cpu to peak at - * @ts: The timestamp counter of this event. - * - * This will return the event that will be read next, but does - * not consume the data. - */ -struct ring_buffer_event * -ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +static struct ring_buffer_event * +rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; @@ -1757,10 +1842,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) * can have. Nesting 10 deep of interrupts is clearly * an anomaly. 
*/ - if (unlikely(++nr_loops > 10)) { - RB_WARN_ON(cpu_buffer, 1); + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) return NULL; - } reader = rb_get_reader_page(cpu_buffer); if (!reader) @@ -1798,16 +1881,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) return NULL; } -/** - * ring_buffer_iter_peek - peek at the next event to be read - * @iter: The ring buffer iterator - * @ts: The timestamp counter of this event. - * - * This will return the event that will be read next, but does - * not increment the iterator. - */ -struct ring_buffer_event * -ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) +static struct ring_buffer_event * +rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) { struct ring_buffer *buffer; struct ring_buffer_per_cpu *cpu_buffer; @@ -1829,10 +1904,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) * can have. Nesting 10 deep of interrupts is clearly * an anomaly. */ - if (unlikely(++nr_loops > 10)) { - RB_WARN_ON(cpu_buffer, 1); + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) return NULL; - } if (rb_per_cpu_empty(cpu_buffer)) return NULL; @@ -1869,6 +1942,51 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) } /** + * ring_buffer_peek - peek at the next event to be read + * @buffer: The ring buffer to read + * @cpu: The cpu to peak at + * @ts: The timestamp counter of this event. + * + * This will return the event that will be read next, but does + * not consume the data. 
+ */ +struct ring_buffer_event * +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_event *event; + unsigned long flags; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + event = rb_buffer_peek(buffer, cpu, ts); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return event; +} + +/** + * ring_buffer_iter_peek - peek at the next event to be read + * @iter: The ring buffer iterator + * @ts: The timestamp counter of this event. + * + * This will return the event that will be read next, but does + * not increment the iterator. + */ +struct ring_buffer_event * +ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + struct ring_buffer_event *event; + unsigned long flags; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + event = rb_iter_peek(iter, ts); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return event; +} + +/** * ring_buffer_consume - return an event and consume it * @buffer: The ring buffer to get the next event from * @@ -1879,19 +1997,24 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event * ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) { - struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; + unsigned long flags; if (!cpu_isset(cpu, buffer->cpumask)) return NULL; - event = ring_buffer_peek(buffer, cpu, ts); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + event = rb_buffer_peek(buffer, cpu, ts); if (!event) - return NULL; + goto out; - cpu_buffer = buffer->buffers[cpu]; rb_advance_reader(cpu_buffer); + out: + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + return event; } @@ -1928,9 +2051,11 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) 
atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); - spin_lock_irqsave(&cpu_buffer->lock, flags); - ring_buffer_iter_reset(iter); - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + __raw_spin_lock(&cpu_buffer->lock); + rb_iter_reset(iter); + __raw_spin_unlock(&cpu_buffer->lock); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return iter; } @@ -1962,12 +2087,17 @@ struct ring_buffer_event * ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) { struct ring_buffer_event *event; + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + unsigned long flags; - event = ring_buffer_iter_peek(iter, ts); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + event = rb_iter_peek(iter, ts); if (!event) - return NULL; + goto out; rb_advance_iter(iter); + out: + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return event; } @@ -1987,7 +2117,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->head_page = list_entry(cpu_buffer->pages.next, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); - local_set(&cpu_buffer->head_page->commit, 0); + local_set(&cpu_buffer->head_page->page->commit, 0); cpu_buffer->head_page->read = 0; @@ -1996,7 +2126,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) INIT_LIST_HEAD(&cpu_buffer->reader_page->list); local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->commit, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; cpu_buffer->overrun = 0; @@ -2016,11 +2146,15 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (!cpu_isset(cpu, buffer->cpumask)) return; - spin_lock_irqsave(&cpu_buffer->lock, flags); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + __raw_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + __raw_spin_unlock(&cpu_buffer->lock); + + 
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /** @@ -2090,8 +2224,7 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, return -EINVAL; /* At least make sure the two buffers are somewhat the same */ - if (buffer_a->size != buffer_b->size || - buffer_a->pages != buffer_b->pages) + if (buffer_a->pages != buffer_b->pages) return -EINVAL; cpu_buffer_a = buffer_a->buffers[cpu]; @@ -2118,16 +2251,178 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, return 0; } +static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_data_page *bpage) +{ + struct ring_buffer_event *event; + unsigned long head; + + __raw_spin_lock(&cpu_buffer->lock); + for (head = 0; head < local_read(&bpage->commit); + head += rb_event_length(event)) { + + event = __rb_data_page_index(bpage, head); + if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) + return; + /* Only count data entries */ + if (event->type != RINGBUF_TYPE_DATA) + continue; + cpu_buffer->entries--; + } + __raw_spin_unlock(&cpu_buffer->lock); +} + +/** + * ring_buffer_alloc_read_page - allocate a page to read from buffer + * @buffer: the buffer to allocate for. + * + * This function is used in conjunction with ring_buffer_read_page. + * When reading a full page from the ring buffer, these functions + * can be used to speed up the process. The calling function should + * allocate a few pages first with this function. Then when it + * needs to get pages from the ring buffer, it passes the result + * of this function into ring_buffer_read_page, which will swap + * the page that was allocated, with the read page of the buffer. + * + * Returns: + * The page allocated, or NULL on error. 
+ */ +void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) +{ + unsigned long addr; + struct buffer_data_page *bpage; + + addr = __get_free_page(GFP_KERNEL); + if (!addr) + return NULL; + + bpage = (void *)addr; + + return bpage; +} + +/** + * ring_buffer_free_read_page - free an allocated read page + * @buffer: the buffer the page was allocate for + * @data: the page to free + * + * Free a page allocated from ring_buffer_alloc_read_page. + */ +void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) +{ + free_page((unsigned long)data); +} + +/** + * ring_buffer_read_page - extract a page from the ring buffer + * @buffer: buffer to extract from + * @data_page: the page to use allocated from ring_buffer_alloc_read_page + * @cpu: the cpu of the buffer to extract + * @full: should the extraction only happen when the page is full. + * + * This function will pull out a page from the ring buffer and consume it. + * @data_page must be the address of the variable that was returned + * from ring_buffer_alloc_read_page. This is because the page might be used + * to swap with a page in the ring buffer. + * + * for example: + * rpage = ring_buffer_alloc_page(buffer); + * if (!rpage) + * return error; + * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0); + * if (ret) + * process_page(rpage); + * + * When @full is set, the function will not return true unless + * the writer is off the reader page. + * + * Note: it is up to the calling functions to handle sleeps and wakeups. + * The ring buffer can be used anywhere in the kernel and can not + * blindly call wake_up. The layer that uses the ring buffer must be + * responsible for that. + * + * Returns: + * 1 if data has been transferred + * 0 if no data has been transferred. 
+ */ +int ring_buffer_read_page(struct ring_buffer *buffer, + void **data_page, int cpu, int full) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_event *event; + struct buffer_data_page *bpage; + unsigned long flags; + int ret = 0; + + if (!data_page) + return 0; + + bpage = *data_page; + if (!bpage) + return 0; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + /* + * rb_buffer_peek will get the next ring buffer if + * the current reader page is empty. + */ + event = rb_buffer_peek(buffer, cpu, NULL); + if (!event) + goto out; + + /* check for data */ + if (!local_read(&cpu_buffer->reader_page->page->commit)) + goto out; + /* + * If the writer is already off of the read page, then simply + * switch the read page with the given page. Otherwise + * we need to copy the data from the reader to the writer. + */ + if (cpu_buffer->reader_page == cpu_buffer->commit_page) { + unsigned int read = cpu_buffer->reader_page->read; + + if (full) + goto out; + /* The writer is still on the reader page, we must copy */ + bpage = cpu_buffer->reader_page->page; + memcpy(bpage->data, + cpu_buffer->reader_page->page->data + read, + local_read(&bpage->commit) - read); + + /* consume what was read */ + cpu_buffer->reader_page += read; + + } else { + /* swap the pages */ + rb_init_page(bpage); + bpage = cpu_buffer->reader_page->page; + cpu_buffer->reader_page->page = *data_page; + cpu_buffer->reader_page->read = 0; + *data_page = bpage; + } + ret = 1; + + /* update the entry counter */ + rb_remove_entries(cpu_buffer, bpage); + out: + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return ret; +} + static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - int *p = filp->private_data; + long *p = filp->private_data; char buf[64]; int r; - /* !ring_buffers_off == tracing_on */ - r = sprintf(buf, "%d\n", !*p); + if (test_bit(RB_BUFFERS_DISABLED_BIT, p)) + r = sprintf(buf, "permanently 
disabled\n"); + else + r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p)); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -2136,7 +2431,7 @@ static ssize_t rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - int *p = filp->private_data; + long *p = filp->private_data; char buf[64]; long val; int ret; @@ -2153,8 +2448,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - /* !ring_buffers_off == tracing_on */ - *p = !val; + if (val) + set_bit(RB_BUFFERS_ON_BIT, p); + else + clear_bit(RB_BUFFERS_ON_BIT, p); (*ppos)++; @@ -2176,7 +2473,7 @@ static __init int rb_init_debugfs(void) d_tracer = tracing_init_dentry(); entry = debugfs_create_file("tracing_on", 0644, d_tracer, - &ring_buffers_off, &rb_simple_fops); + &ring_buffer_flags, &rb_simple_fops); if (!entry) pr_warning("Could not create debugfs 'tracing_on' entry\n"); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1ee9e4e454a..f4bb3800318 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -30,6 +30,7 @@ #include <linux/gfp.h> #include <linux/fs.h> #include <linux/kprobes.h> +#include <linux/seq_file.h> #include <linux/writeback.h> #include <linux/stacktrace.h> @@ -43,6 +44,38 @@ unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; +/* + * We need to change this state when a selftest is running. + * A selftest will lurk into the ring-buffer to count the + * entries inserted during the selftest although some concurrent + * insertions into the ring-buffer such as ftrace_printk could occurred + * at the same time, giving false positive or negative results. 
+ */ +static bool __read_mostly tracing_selftest_running; + +/* For tracers that don't implement custom flags */ +static struct tracer_opt dummy_tracer_opt[] = { + { } +}; + +static struct tracer_flags dummy_tracer_flags = { + .val = 0, + .opts = dummy_tracer_opt +}; + +static int dummy_set_flag(u32 old_flags, u32 bit, int set) +{ + return 0; +} + +/* + * Kill all tracing for good (never come back). + * It is initialized to 1 but will turn to zero if the initialization + * of the tracer is successful. But that is the only place that sets + * this back to zero. + */ +int tracing_disabled = 1; + static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); static inline void ftrace_disable_cpu(void) @@ -62,7 +95,36 @@ static cpumask_t __read_mostly tracing_buffer_mask; #define for_each_tracing_cpu(cpu) \ for_each_cpu_mask(cpu, tracing_buffer_mask) -static int tracing_disabled = 1; +/* + * ftrace_dump_on_oops - variable to dump ftrace buffer on oops + * + * If there is an oops (or kernel panic) and the ftrace_dump_on_oops + * is set, then ftrace_dump is called. This will output the contents + * of the ftrace buffers to the console. This is very useful for + * capturing traces that lead to crashes and outputing it to a + * serial console. + * + * It is default off, but you can enable it with either specifying + * "ftrace_dump_on_oops" in the kernel command line, or setting + * /proc/sys/kernel/ftrace_dump_on_oops to true. 
+ */ +int ftrace_dump_on_oops; + +static int tracing_set_tracer(char *buf); + +static int __init set_ftrace(char *str) +{ + tracing_set_tracer(str); + return 1; +} +__setup("ftrace", set_ftrace); + +static int __init set_ftrace_dump_on_oops(char *str) +{ + ftrace_dump_on_oops = 1; + return 1; +} +__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); long ns2usecs(cycle_t nsec) @@ -112,6 +174,19 @@ static DEFINE_PER_CPU(struct trace_array_cpu, max_data); /* tracer_enabled is used to toggle activation of a tracer */ static int tracer_enabled = 1; +/** + * tracing_is_enabled - return tracer_enabled status + * + * This function is used by other tracers to know the status + * of the tracer_enabled flag. Tracers may use this function + * to know if it should enable their features when starting + * up. See irqsoff tracer for an example (start_irqsoff_tracer). + */ +int tracing_is_enabled(void) +{ + return tracer_enabled; +} + /* function tracing enabled */ int ftrace_function_enabled; @@ -153,8 +228,9 @@ static DEFINE_MUTEX(trace_types_lock); /* trace_wait is a waitqueue for tasks blocked on trace_poll */ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); -/* trace_flags holds iter_ctrl options */ -unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; +/* trace_flags holds trace_options default values */ +unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | + TRACE_ITER_ANNOTATE; /** * trace_wake_up - wake up tasks waiting for trace input @@ -193,13 +269,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) return nsecs / 1000; } -/* - * TRACE_ITER_SYM_MASK masks the options in trace_flags that - * control the output of kernel symbols. 
- */ -#define TRACE_ITER_SYM_MASK \ - (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) - /* These must match the bit postions in trace_iterator_flags */ static const char *trace_options[] = { "print-parent", @@ -213,6 +282,12 @@ static const char *trace_options[] = { "stacktrace", "sched-tree", "ftrace_printk", + "ftrace_preempt", + "branch", + "annotate", + "userstacktrace", + "sym-userobj", + "printk-msg-only", NULL }; @@ -359,6 +434,28 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) return trace_seq_putmem(s, hex, j); } +static int +trace_seq_path(struct trace_seq *s, struct path *path) +{ + unsigned char *p; + + if (s->len >= (PAGE_SIZE - 1)) + return 0; + p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); + if (!IS_ERR(p)) { + p = mangle_path(s->buffer + s->len, p, "\n"); + if (p) { + s->len = p - s->buffer; + return 1; + } + } else { + s->buffer[s->len++] = '?'; + return 1; + } + + return 0; +} + static void trace_seq_reset(struct trace_seq *s) { @@ -470,7 +567,17 @@ int register_tracer(struct tracer *type) return -1; } + /* + * When this gets called we hold the BKL which means that + * preemption is disabled. Various trace selftests however + * need to disable and enable preemption for successful tests. + * So we drop the BKL here and grab it after the tests again. 
+ */ + unlock_kernel(); mutex_lock(&trace_types_lock); + + tracing_selftest_running = true; + for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ @@ -481,12 +588,20 @@ int register_tracer(struct tracer *type) } } + if (!type->set_flag) + type->set_flag = &dummy_set_flag; + if (!type->flags) + type->flags = &dummy_tracer_flags; + else + if (!type->flags->opts) + type->flags->opts = dummy_tracer_opt; + #ifdef CONFIG_FTRACE_STARTUP_TEST if (type->selftest) { struct tracer *saved_tracer = current_trace; struct trace_array *tr = &global_trace; - int saved_ctrl = tr->ctrl; int i; + /* * Run a selftest on this tracer. * Here we reset the trace buffer, and set the current @@ -494,25 +609,23 @@ int register_tracer(struct tracer *type) * internal tracing to verify that everything is in order. * If we fail, we do not register this tracer. */ - for_each_tracing_cpu(i) { + for_each_tracing_cpu(i) tracing_reset(tr, i); - } + current_trace = type; - tr->ctrl = 0; /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); /* the test is responsible for resetting too */ current_trace = saved_tracer; - tr->ctrl = saved_ctrl; if (ret) { printk(KERN_CONT "FAILED!\n"); goto out; } /* Only reset on passing, to avoid touching corrupted buffers */ - for_each_tracing_cpu(i) { + for_each_tracing_cpu(i) tracing_reset(tr, i); - } + printk(KERN_CONT "PASSED\n"); } #endif @@ -524,7 +637,9 @@ int register_tracer(struct tracer *type) max_tracer_type_len = len; out: + tracing_selftest_running = false; mutex_unlock(&trace_types_lock); + lock_kernel(); return ret; } @@ -564,6 +679,16 @@ void tracing_reset(struct trace_array *tr, int cpu) ftrace_enable_cpu(); } +void tracing_reset_online_cpus(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); +} + #define SAVED_CMDLINES 128 static unsigned 
map_pid_to_cmdline[PID_MAX_DEFAULT+1]; static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; @@ -581,6 +706,91 @@ static void trace_init_cmdlines(void) cmdline_idx = 0; } +static int trace_stop_count; +static DEFINE_SPINLOCK(tracing_start_lock); + +/** + * ftrace_off_permanent - disable all ftrace code permanently + * + * This should only be called when a serious anomally has + * been detected. This will turn off the function tracing, + * ring buffers, and other tracing utilites. It takes no + * locks and can be called from any context. + */ +void ftrace_off_permanent(void) +{ + tracing_disabled = 1; + ftrace_stop(); + tracing_off_permanent(); +} + +/** + * tracing_start - quick start of the tracer + * + * If tracing is enabled but was stopped by tracing_stop, + * this will start the tracer back up. + */ +void tracing_start(void) +{ + struct ring_buffer *buffer; + unsigned long flags; + + if (tracing_disabled) + return; + + spin_lock_irqsave(&tracing_start_lock, flags); + if (--trace_stop_count) + goto out; + + if (trace_stop_count < 0) { + /* Someone screwed up their debugging */ + WARN_ON_ONCE(1); + trace_stop_count = 0; + goto out; + } + + + buffer = global_trace.buffer; + if (buffer) + ring_buffer_record_enable(buffer); + + buffer = max_tr.buffer; + if (buffer) + ring_buffer_record_enable(buffer); + + ftrace_start(); + out: + spin_unlock_irqrestore(&tracing_start_lock, flags); +} + +/** + * tracing_stop - quick stop of the tracer + * + * Light weight way to stop tracing. Use in conjunction with + * tracing_start. 
+ */ +void tracing_stop(void) +{ + struct ring_buffer *buffer; + unsigned long flags; + + ftrace_stop(); + spin_lock_irqsave(&tracing_start_lock, flags); + if (trace_stop_count++) + goto out; + + buffer = global_trace.buffer; + if (buffer) + ring_buffer_record_disable(buffer); + + buffer = max_tr.buffer; + if (buffer) + ring_buffer_record_disable(buffer); + + out: + spin_unlock_irqrestore(&tracing_start_lock, flags); +} + void trace_stop_cmdline_recording(void); static void trace_save_cmdline(struct task_struct *tsk) @@ -618,7 +828,7 @@ static void trace_save_cmdline(struct task_struct *tsk) spin_unlock(&trace_cmdline_lock); } -static char *trace_find_cmdline(int pid) +char *trace_find_cmdline(int pid) { char *cmdline = "<...>"; unsigned map; @@ -655,6 +865,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; + entry->tgid = (tsk) ? tsk->tgid : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? 
TRACE_FLAG_IRQS_OFF : 0) | @@ -691,6 +902,56 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, ring_buffer_unlock_commit(tr->buffer, event, irq_flags); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void __trace_graph_entry(struct trace_array *tr, + struct trace_array_cpu *data, + struct ftrace_graph_ent *trace, + unsigned long flags, + int pc) +{ + struct ring_buffer_event *event; + struct ftrace_graph_ent_entry *entry; + unsigned long irq_flags; + + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return; + + event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_GRAPH_ENT; + entry->graph_ent = *trace; + ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); +} + +static void __trace_graph_return(struct trace_array *tr, + struct trace_array_cpu *data, + struct ftrace_graph_ret *trace, + unsigned long flags, + int pc) +{ + struct ring_buffer_event *event; + struct ftrace_graph_ret_entry *entry; + unsigned long irq_flags; + + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return; + + event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_GRAPH_RET; + entry->ret = *trace; + ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); +} +#endif + void ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, unsigned long flags, @@ -742,6 +1003,46 @@ void __trace_stack(struct trace_array *tr, ftrace_trace_stack(tr, data, flags, skip, preempt_count()); } +static void ftrace_trace_userstack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, int pc) +{ +#ifdef CONFIG_STACKTRACE 
+ struct ring_buffer_event *event; + struct userstack_entry *entry; + struct stack_trace trace; + unsigned long irq_flags; + + if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) + return; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_USER_STACK; + + memset(&entry->caller, 0, sizeof(entry->caller)); + + trace.nr_entries = 0; + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.skip = 0; + trace.entries = entry->caller; + + save_stack_trace_user(&trace); + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +#endif +} + +void __trace_userstack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags) +{ + ftrace_trace_userstack(tr, data, flags, preempt_count()); +} + static void ftrace_trace_special(void *__tr, void *__data, unsigned long arg1, unsigned long arg2, unsigned long arg3, @@ -765,6 +1066,7 @@ ftrace_trace_special(void *__tr, void *__data, entry->arg3 = arg3; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); ftrace_trace_stack(tr, data, irq_flags, 4, pc); + ftrace_trace_userstack(tr, data, irq_flags, pc); trace_wake_up(); } @@ -803,6 +1105,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_cpu = task_cpu(next); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); ftrace_trace_stack(tr, data, flags, 5, pc); + ftrace_trace_userstack(tr, data, flags, pc); } void @@ -832,6 +1135,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_cpu = task_cpu(wakee); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); ftrace_trace_stack(tr, data, flags, 6, pc); + ftrace_trace_userstack(tr, data, flags, pc); trace_wake_up(); } @@ -841,26 +1145,28 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; + unsigned long flags; int cpu; int 
pc; - if (tracing_disabled || !tr->ctrl) + if (tracing_disabled) return; pc = preempt_count(); - preempt_disable_notrace(); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (likely(!atomic_read(&data->disabled))) + if (likely(atomic_inc_return(&data->disabled) == 1)) ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); - preempt_enable_notrace(); + atomic_dec(&data->disabled); + local_irq_restore(flags); } #ifdef CONFIG_FUNCTION_TRACER static void -function_trace_call(unsigned long ip, unsigned long parent_ip) +function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -873,8 +1179,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) return; pc = preempt_count(); - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); local_save_flags(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -884,12 +1189,97 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) trace_function(tr, data, ip, parent_ip, flags, pc); atomic_dec(&data->disabled); - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); } +static void +function_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + if (unlikely(!ftrace_function_enabled)) + return; + + /* + * Need to use raw, since this must be called before the + * recursive protection is performed. 
+ */ + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + pc = preempt_count(); + trace_function(tr, data, ip, parent_ip, flags, pc); + } + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +int trace_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + if (!ftrace_trace_task(current)) + return 0; + + if (!ftrace_graph_addr(trace->func)) + return 0; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + __trace_graph_entry(tr, data, trace, flags, pc); + } + /* Only do the atomic if it is not already set */ + if (!test_tsk_trace_graph(current)) + set_tsk_trace_graph(current); + atomic_dec(&data->disabled); + local_irq_restore(flags); + + return 1; +} + +void trace_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + __trace_graph_return(tr, data, trace, flags, pc); + } + if (!trace->depth) + clear_tsk_trace_graph(current); + atomic_dec(&data->disabled); + local_irq_restore(flags); +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + static struct ftrace_ops trace_ops __read_mostly = { .func = function_trace_call, @@ -898,9 +1288,14 @@ static struct ftrace_ops trace_ops __read_mostly = void tracing_start_function_trace(void) { ftrace_function_enabled = 0; + + if (trace_flags & TRACE_ITER_PREEMPTONLY) + trace_ops.func = 
function_trace_call_preempt_only; + else + trace_ops.func = function_trace_call; + register_ftrace_function(&trace_ops); - if (tracer_enabled) - ftrace_function_enabled = 1; + ftrace_function_enabled = 1; } void tracing_stop_function_trace(void) @@ -912,6 +1307,7 @@ void tracing_stop_function_trace(void) enum trace_file_type { TRACE_FILE_LAT_FMT = 1, + TRACE_FILE_ANNOTATE = 2, }; static void trace_iterator_increment(struct trace_iterator *iter, int cpu) @@ -1047,10 +1443,6 @@ static void *s_start(struct seq_file *m, loff_t *pos) atomic_inc(&trace_record_cmdline_disabled); - /* let the tracer grab locks here if needed */ - if (current_trace->start) - current_trace->start(iter); - if (*pos != iter->pos) { iter->ent = NULL; iter->cpu = 0; @@ -1077,14 +1469,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) static void s_stop(struct seq_file *m, void *p) { - struct trace_iterator *iter = m->private; - atomic_dec(&trace_record_cmdline_disabled); - - /* let the tracer release locks here if needed */ - if (current_trace && current_trace == iter->trace && iter->trace->stop) - iter->trace->stop(iter); - mutex_unlock(&trace_types_lock); } @@ -1143,7 +1528,7 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, # define IP_FMT "%016lx" #endif -static int +int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) { int ret; @@ -1164,6 +1549,78 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } +static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags) +{ + struct file *file = NULL; + unsigned long vmstart = 0; + int ret = 1; + + if (mm) { + const struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, ip); + if (vma) { + file = vma->vm_file; + vmstart = vma->vm_start; + } + if (file) { + ret = trace_seq_path(s, &file->f_path); + if (ret) + ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart); + } + 
up_read(&mm->mmap_sem); + } + if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) + ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + return ret; +} + +static int +seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, + unsigned long sym_flags) +{ + struct mm_struct *mm = NULL; + int ret = 1; + unsigned int i; + + if (trace_flags & TRACE_ITER_SYM_USEROBJ) { + struct task_struct *task; + /* + * we do the lookup on the thread group leader, + * since individual threads might have already quit! + */ + rcu_read_lock(); + task = find_task_by_vpid(entry->ent.tgid); + if (task) + mm = get_task_mm(task); + rcu_read_unlock(); + } + + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + unsigned long ip = entry->caller[i]; + + if (ip == ULONG_MAX || !ret) + break; + if (i && ret) + ret = trace_seq_puts(s, " <- "); + if (!ip) { + if (ret) + ret = trace_seq_puts(s, "??"); + continue; + } + if (!ret) + break; + if (ret) + ret = seq_print_user_ip(s, mm, ip, sym_flags); + } + + if (mm) + mmput(mm); + return ret; +} + static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n"); @@ -1301,6 +1758,13 @@ lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; +static int task_state_char(unsigned long state) +{ + int bit = state ? __ffs(state) + 1 : 0; + + return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; +} + /* * The message is supposed to contain an ending newline. * If the printing stops prematurely, try to add a newline of our own. 
@@ -1338,6 +1802,23 @@ void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) trace_seq_putc(s, '\n'); } +static void test_cpu_buff_start(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + + if (!(trace_flags & TRACE_ITER_ANNOTATE)) + return; + + if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) + return; + + if (cpu_isset(iter->cpu, iter->started)) + return; + + cpu_set(iter->cpu, iter->started); + trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); +} + static enum print_line_t print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) { @@ -1352,11 +1833,12 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) char *comm; int S, T; int i; - unsigned state; if (entry->type == TRACE_CONT) return TRACE_TYPE_HANDLED; + test_cpu_buff_start(iter); + next_entry = find_next_entry(iter, NULL, &next_ts); if (!next_entry) next_ts = iter->ts; @@ -1396,12 +1878,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) trace_assign_type(field, entry); - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - - state = field->prev_state ? - __ffs(field->prev_state) + 1 : 0; - S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; + T = task_state_char(field->next_state); + S = task_state_char(field->prev_state); comm = trace_find_cmdline(field->next_pid); trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", field->prev_pid, @@ -1448,6 +1926,27 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) trace_seq_print_cont(s, iter); break; } + case TRACE_BRANCH: { + struct trace_branch *field; + + trace_assign_type(field, entry); + + trace_seq_printf(s, "[%s] %s:%s:%d\n", + field->correct ? 
" ok " : " MISS ", + field->func, + field->file, + field->line); + break; + } + case TRACE_USER_STACK: { + struct userstack_entry *field; + + trace_assign_type(field, entry); + + seq_print_userip_objs(field, s, sym_flags); + trace_seq_putc(s, '\n'); + break; + } default: trace_seq_printf(s, "Unknown type %d\n", entry->type); } @@ -1472,6 +1971,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) if (entry->type == TRACE_CONT) return TRACE_TYPE_HANDLED; + test_cpu_buff_start(iter); + comm = trace_find_cmdline(iter->ent->pid); t = ns2usecs(iter->ts); @@ -1519,10 +2020,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; + T = task_state_char(field->next_state); + S = task_state_char(field->prev_state); ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n", field->prev_pid, field->prev_prio, @@ -1581,6 +2080,37 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) trace_seq_print_cont(s, iter); break; } + case TRACE_GRAPH_RET: { + return print_graph_function(iter); + } + case TRACE_GRAPH_ENT: { + return print_graph_function(iter); + } + case TRACE_BRANCH: { + struct trace_branch *field; + + trace_assign_type(field, entry); + + trace_seq_printf(s, "[%s] %s:%s:%d\n", + field->correct ? 
" ok " : " MISS ", + field->func, + field->file, + field->line); + break; + } + case TRACE_USER_STACK: { + struct userstack_entry *field; + + trace_assign_type(field, entry); + + ret = seq_print_userip_objs(field, s, sym_flags); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + ret = trace_seq_putc(s, '\n'); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + break; + } } return TRACE_TYPE_HANDLED; } @@ -1621,12 +2151,9 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - if (entry->type == TRACE_WAKE) - S = '+'; + T = task_state_char(field->next_state); + S = entry->type == TRACE_WAKE ? '+' : + task_state_char(field->prev_state); ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, @@ -1640,6 +2167,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) break; } case TRACE_SPECIAL: + case TRACE_USER_STACK: case TRACE_STACK: { struct special_entry *field; @@ -1712,12 +2240,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - if (entry->type == TRACE_WAKE) - S = '+'; + T = task_state_char(field->next_state); + S = entry->type == TRACE_WAKE ? 
'+' : + task_state_char(field->prev_state); SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); SEQ_PUT_HEX_FIELD_RET(s, S); @@ -1728,6 +2253,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) break; } case TRACE_SPECIAL: + case TRACE_USER_STACK: case TRACE_STACK: { struct special_entry *field; @@ -1744,6 +2270,25 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } +static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct print_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_printf(s, field->buf); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (entry->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + + return TRACE_TYPE_HANDLED; +} + static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1782,6 +2327,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) break; } case TRACE_SPECIAL: + case TRACE_USER_STACK: case TRACE_STACK: { struct special_entry *field; @@ -1823,6 +2369,11 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) return ret; } + if (iter->ent->type == TRACE_PRINT && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return print_printk_msg_only(iter); + if (trace_flags & TRACE_ITER_BIN) return print_bin_fmt(iter); @@ -1847,7 +2398,9 @@ static int s_show(struct seq_file *m, void *v) seq_printf(m, "# tracer: %s\n", iter->trace->name); seq_puts(m, "#\n"); } - if (iter->iter_flags & TRACE_FILE_LAT_FMT) { + if (iter->trace && iter->trace->print_header) + iter->trace->print_header(m); + else if (iter->iter_flags & TRACE_FILE_LAT_FMT) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) return 0; @@ -1899,6 +2452,15 @@ __tracing_open(struct inode *inode, struct 
file *file, int *ret) iter->trace = current_trace; iter->pos = -1; + /* Notify the tracer early; before we stop tracing. */ + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + + /* Annotate start of buffers if we had overruns */ + if (ring_buffer_overruns(iter->tr->buffer)) + iter->iter_flags |= TRACE_FILE_ANNOTATE; + + for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = @@ -1917,13 +2479,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) m->private = iter; /* stop the trace while dumping */ - if (iter->tr->ctrl) { - tracer_enabled = 0; - ftrace_function_enabled = 0; - } - - if (iter->trace && iter->trace->open) - iter->trace->open(iter); + tracing_stop(); mutex_unlock(&trace_types_lock); @@ -1966,14 +2522,7 @@ int tracing_release(struct inode *inode, struct file *file) iter->trace->close(iter); /* reenable tracing if it was previously enabled */ - if (iter->tr->ctrl) { - tracer_enabled = 1; - /* - * It is safe to enable function tracing even if it - * isn't used - */ - ftrace_function_enabled = 1; - } + tracing_start(); mutex_unlock(&trace_types_lock); seq_release(inode, file); @@ -2151,7 +2700,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, if (err) goto err_unlock; - raw_local_irq_disable(); + local_irq_disable(); __raw_spin_lock(&ftrace_max_lock); for_each_tracing_cpu(cpu) { /* @@ -2168,7 +2717,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, } } __raw_spin_unlock(&ftrace_max_lock); - raw_local_irq_enable(); + local_irq_enable(); tracing_cpumask = tracing_cpumask_new; @@ -2189,13 +2738,16 @@ static struct file_operations tracing_cpumask_fops = { }; static ssize_t -tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, +tracing_trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { + int i; char *buf; int r = 0; int len = 0; - int i; + u32 tracer_flags = current_trace->flags->val; + struct tracer_opt *trace_opts = current_trace->flags->opts; + /* 
calulate max size */ for (i = 0; trace_options[i]; i++) { @@ -2203,6 +2755,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, len += 3; /* "no" and space */ } + /* + * Increase the size with names of options specific + * of the current tracer. + */ + for (i = 0; trace_opts[i].name; i++) { + len += strlen(trace_opts[i].name); + len += 3; /* "no" and space */ + } + /* +2 for \n and \0 */ buf = kmalloc(len + 2, GFP_KERNEL); if (!buf) @@ -2215,6 +2776,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, r += sprintf(buf + r, "no%s ", trace_options[i]); } + for (i = 0; trace_opts[i].name; i++) { + if (tracer_flags & trace_opts[i].bit) + r += sprintf(buf + r, "%s ", + trace_opts[i].name); + else + r += sprintf(buf + r, "no%s ", + trace_opts[i].name); + } + r += sprintf(buf + r, "\n"); WARN_ON(r >= len + 2); @@ -2225,13 +2795,48 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, return r; } +/* Try to assign a tracer specific option */ +static int set_tracer_option(struct tracer *trace, char *cmp, int neg) +{ + struct tracer_flags *trace_flags = trace->flags; + struct tracer_opt *opts = NULL; + int ret = 0, i = 0; + int len; + + for (i = 0; trace_flags->opts[i].name; i++) { + opts = &trace_flags->opts[i]; + len = strlen(opts->name); + + if (strncmp(cmp, opts->name, len) == 0) { + ret = trace->set_flag(trace_flags->val, + opts->bit, !neg); + break; + } + } + /* Not found */ + if (!trace_flags->opts[i].name) + return -EINVAL; + + /* Refused to handle */ + if (ret) + return ret; + + if (neg) + trace_flags->val &= ~opts->bit; + else + trace_flags->val |= opts->bit; + + return 0; +} + static ssize_t -tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, +tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; char *cmp = buf; int neg = 0; + int ret; int i; if (cnt >= sizeof(buf)) @@ -2258,11 +2863,13 @@ tracing_iter_ctrl_write(struct file *filp, const char __user 
*ubuf, break; } } - /* - * If no option could be set, return an error: - */ - if (!trace_options[i]) - return -EINVAL; + + /* If no option could be set, test the specific tracer options */ + if (!trace_options[i]) { + ret = set_tracer_option(current_trace, cmp, neg); + if (ret) + return ret; + } filp->f_pos += cnt; @@ -2271,8 +2878,8 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, static struct file_operations tracing_iter_fops = { .open = tracing_open_generic, - .read = tracing_iter_ctrl_read, - .write = tracing_iter_ctrl_write, + .read = tracing_trace_options_read, + .write = tracing_trace_options_write, }; static const char readme_msg[] = @@ -2286,9 +2893,9 @@ static const char readme_msg[] = "# echo sched_switch > /debug/tracing/current_tracer\n" "# cat /debug/tracing/current_tracer\n" "sched_switch\n" - "# cat /debug/tracing/iter_ctrl\n" + "# cat /debug/tracing/trace_options\n" "noprint-parent nosym-offset nosym-addr noverbose\n" - "# echo print-parent > /debug/tracing/iter_ctrl\n" + "# echo print-parent > /debug/tracing/trace_options\n" "# echo 1 > /debug/tracing/tracing_enabled\n" "# cat /debug/tracing/trace > /tmp/trace.txt\n" "echo 0 > /debug/tracing/tracing_enabled\n" @@ -2311,11 +2918,10 @@ static ssize_t tracing_ctrl_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_array *tr = filp->private_data; char buf[64]; int r; - r = sprintf(buf, "%ld\n", tr->ctrl); + r = sprintf(buf, "%u\n", tracer_enabled); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -2343,16 +2949,18 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, val = !!val; mutex_lock(&trace_types_lock); - if (tr->ctrl ^ val) { - if (val) + if (tracer_enabled ^ val) { + if (val) { tracer_enabled = 1; - else + if (current_trace->start) + current_trace->start(tr); + tracing_start(); + } else { tracer_enabled = 0; - - tr->ctrl = val; - - if (current_trace && current_trace->ctrl_update) - 
current_trace->ctrl_update(tr); + tracing_stop(); + if (current_trace->stop) + current_trace->stop(tr); + } } mutex_unlock(&trace_types_lock); @@ -2378,29 +2986,11 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static ssize_t -tracing_set_trace_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int tracing_set_tracer(char *buf) { struct trace_array *tr = &global_trace; struct tracer *t; - char buf[max_tracer_type_len+1]; - int i; - size_t ret; - - ret = cnt; - - if (cnt > max_tracer_type_len) - cnt = max_tracer_type_len; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - /* strip ending whitespace. */ - for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) - buf[i] = 0; + int ret = 0; mutex_lock(&trace_types_lock); for (t = trace_types; t; t = t->next) { @@ -2414,18 +3004,52 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, if (t == current_trace) goto out; + trace_branch_disable(); if (current_trace && current_trace->reset) current_trace->reset(tr); current_trace = t; - if (t->init) - t->init(tr); + if (t->init) { + ret = t->init(tr); + if (ret) + goto out; + } + trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); - if (ret > 0) - filp->f_pos += ret; + return ret; +} + +static ssize_t +tracing_set_trace_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[max_tracer_type_len+1]; + int i; + size_t ret; + int err; + + ret = cnt; + + if (cnt > max_tracer_type_len) + cnt = max_tracer_type_len; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + /* strip ending whitespace. 
*/ + for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) + buf[i] = 0; + + err = tracing_set_tracer(buf); + if (err) + return err; + + filp->f_pos += ret; return ret; } @@ -2492,6 +3116,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) return -ENOMEM; mutex_lock(&trace_types_lock); + + /* trace pipe does not show start of buffer */ + cpus_setall(iter->started); + iter->tr = &global_trace; iter->trace = current_trace; filp->private_data = iter; @@ -2667,7 +3295,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf, char buf[64]; int r; - r = sprintf(buf, "%lu\n", tr->entries); + r = sprintf(buf, "%lu\n", tr->entries >> 10); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -2678,7 +3306,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, unsigned long val; char buf[64]; int ret, cpu; - struct trace_array *tr = filp->private_data; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2698,12 +3325,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, mutex_lock(&trace_types_lock); - if (tr->ctrl) { - cnt = -EBUSY; - pr_info("ftrace: please disable tracing" - " before modifying buffer size\n"); - goto out; - } + tracing_stop(); /* disable all cpu buffers */ for_each_tracing_cpu(cpu) { @@ -2713,6 +3335,9 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, atomic_inc(&max_tr.data[cpu]->disabled); } + /* value is in KB */ + val <<= 10; + if (val != global_trace.entries) { ret = ring_buffer_resize(global_trace.buffer, val); if (ret < 0) { @@ -2751,6 +3376,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, atomic_dec(&max_tr.data[cpu]->disabled); } + tracing_start(); max_tr.entries = global_trace.entries; mutex_unlock(&trace_types_lock); @@ -2762,7 +3388,7 @@ static int mark_printk(const char *fmt, ...) 
int ret; va_list args; va_start(args, fmt); - ret = trace_vprintk(0, fmt, args); + ret = trace_vprintk(0, -1, fmt, args); va_end(args); return ret; } @@ -2773,9 +3399,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, { char *buf; char *end; - struct trace_array *tr = &global_trace; - if (!tr->ctrl || tracing_disabled) + if (tracing_disabled) return -EINVAL; if (cnt > TRACE_BUF_SIZE) @@ -2841,22 +3466,38 @@ static struct file_operations tracing_mark_fops = { #ifdef CONFIG_DYNAMIC_FTRACE +int __weak ftrace_arch_read_dyn_info(char *buf, int size) +{ + return 0; +} + static ssize_t -tracing_read_long(struct file *filp, char __user *ubuf, +tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { + static char ftrace_dyn_info_buffer[1024]; + static DEFINE_MUTEX(dyn_info_mutex); unsigned long *p = filp->private_data; - char buf[64]; + char *buf = ftrace_dyn_info_buffer; + int size = ARRAY_SIZE(ftrace_dyn_info_buffer); int r; - r = sprintf(buf, "%ld\n", *p); + mutex_lock(&dyn_info_mutex); + r = sprintf(buf, "%ld ", *p); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r); + buf[r++] = '\n'; + + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + + mutex_unlock(&dyn_info_mutex); + + return r; } -static struct file_operations tracing_read_long_fops = { +static struct file_operations tracing_dyn_info_fops = { .open = tracing_open_generic, - .read = tracing_read_long, + .read = tracing_read_dyn_info, }; #endif @@ -2897,10 +3538,10 @@ static __init int tracer_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); - entry = debugfs_create_file("iter_ctrl", 0644, d_tracer, + entry = debugfs_create_file("trace_options", 0644, d_tracer, NULL, &tracing_iter_fops); if (!entry) - pr_warning("Could not create debugfs 'iter_ctrl' entry\n"); + pr_warning("Could not create debugfs 'trace_options' entry\n"); entry = 
debugfs_create_file("tracing_cpumask", 0644, d_tracer, NULL, &tracing_cpumask_fops); @@ -2950,11 +3591,11 @@ static __init int tracer_init_debugfs(void) pr_warning("Could not create debugfs " "'trace_pipe' entry\n"); - entry = debugfs_create_file("trace_entries", 0644, d_tracer, + entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer, &global_trace, &tracing_entries_fops); if (!entry) pr_warning("Could not create debugfs " - "'trace_entries' entry\n"); + "'buffer_size_kb' entry\n"); entry = debugfs_create_file("trace_marker", 0220, d_tracer, NULL, &tracing_mark_fops); @@ -2965,7 +3606,7 @@ static __init int tracer_init_debugfs(void) #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, - &tracing_read_long_fops); + &tracing_dyn_info_fops); if (!entry) pr_warning("Could not create debugfs " "'dyn_ftrace_total_info' entry\n"); @@ -2976,7 +3617,7 @@ static __init int tracer_init_debugfs(void) return 0; } -int trace_vprintk(unsigned long ip, const char *fmt, va_list args) +int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) { static DEFINE_SPINLOCK(trace_buf_lock); static char trace_buf[TRACE_BUF_SIZE]; @@ -2984,11 +3625,11 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; - struct print_entry *entry; - unsigned long flags, irq_flags; int cpu, len = 0, size, pc; + struct print_entry *entry; + unsigned long irq_flags; - if (!tr->ctrl || tracing_disabled) + if (tracing_disabled || tracing_selftest_running) return 0; pc = preempt_count(); @@ -2999,7 +3640,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) if (unlikely(atomic_read(&data->disabled))) goto out; - spin_lock_irqsave(&trace_buf_lock, flags); + pause_graph_tracing(); + spin_lock_irqsave(&trace_buf_lock, irq_flags); len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); 
len = min(len, TRACE_BUF_SIZE-1); @@ -3010,17 +3652,18 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) if (!event) goto out_unlock; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); + tracing_generic_entry_update(&entry->ent, irq_flags, pc); entry->ent.type = TRACE_PRINT; entry->ip = ip; + entry->depth = depth; memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); out_unlock: - spin_unlock_irqrestore(&trace_buf_lock, flags); - + spin_unlock_irqrestore(&trace_buf_lock, irq_flags); + unpause_graph_tracing(); out: preempt_enable_notrace(); @@ -3037,7 +3680,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) return 0; va_start(ap, fmt); - ret = trace_vprintk(ip, fmt, ap); + ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); va_end(ap); return ret; } @@ -3046,7 +3689,8 @@ EXPORT_SYMBOL_GPL(__ftrace_printk); static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { - ftrace_dump(); + if (ftrace_dump_on_oops) + ftrace_dump(); return NOTIFY_OK; } @@ -3062,7 +3706,8 @@ static int trace_die_handler(struct notifier_block *self, { switch (val) { case DIE_OOPS: - ftrace_dump(); + if (ftrace_dump_on_oops) + ftrace_dump(); break; default: break; @@ -3103,7 +3748,6 @@ trace_printk_seq(struct trace_seq *s) trace_seq_reset(s); } - void ftrace_dump(void) { static DEFINE_SPINLOCK(ftrace_dump_lock); @@ -3128,6 +3772,9 @@ void ftrace_dump(void) atomic_inc(&global_trace.data[cpu]->disabled); } + /* don't look at user memory in panic mode */ + trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + printk(KERN_TRACE "Dumping ftrace buffer:\n"); iter.tr = &global_trace; @@ -3221,7 +3868,6 @@ __init static int tracer_alloc_buffers(void) #endif /* All seems OK, enable tracing */ - global_trace.ctrl = tracer_enabled; tracing_disabled = 0; atomic_notifier_chain_register(&panic_notifier_list, diff --git 
a/kernel/trace/trace.h b/kernel/trace/trace.h index 8465ad05270..cc7a4f86403 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -8,6 +8,7 @@ #include <linux/ring_buffer.h> #include <linux/mmiotrace.h> #include <linux/ftrace.h> +#include <trace/boot.h> enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -21,7 +22,14 @@ enum trace_type { TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, - TRACE_BOOT, + TRACE_BRANCH, + TRACE_BOOT_CALL, + TRACE_BOOT_RET, + TRACE_GRAPH_RET, + TRACE_GRAPH_ENT, + TRACE_USER_STACK, + TRACE_HW_BRANCHES, + TRACE_POWER, __TRACE_LAST_TYPE }; @@ -38,6 +46,7 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; + int tgid; }; /* @@ -48,6 +57,18 @@ struct ftrace_entry { unsigned long ip; unsigned long parent_ip; }; + +/* Function call entry */ +struct ftrace_graph_ent_entry { + struct trace_entry ent; + struct ftrace_graph_ent graph_ent; +}; + +/* Function return entry */ +struct ftrace_graph_ret_entry { + struct trace_entry ent; + struct ftrace_graph_ret ret; +}; extern struct tracer boot_tracer; /* @@ -85,12 +106,18 @@ struct stack_entry { unsigned long caller[FTRACE_STACK_ENTRIES]; }; +struct userstack_entry { + struct trace_entry ent; + unsigned long caller[FTRACE_STACK_ENTRIES]; +}; + /* * ftrace_printk entry: */ struct print_entry { struct trace_entry ent; unsigned long ip; + int depth; char buf[]; }; @@ -112,9 +139,35 @@ struct trace_mmiotrace_map { struct mmiotrace_map map; }; -struct trace_boot { +struct trace_boot_call { struct trace_entry ent; - struct boot_trace initcall; + struct boot_trace_call boot_call; +}; + +struct trace_boot_ret { + struct trace_entry ent; + struct boot_trace_ret boot_ret; +}; + +#define TRACE_FUNC_SIZE 30 +#define TRACE_FILE_SIZE 20 +struct trace_branch { + struct trace_entry ent; + unsigned line; + char func[TRACE_FUNC_SIZE+1]; + char file[TRACE_FILE_SIZE+1]; + char correct; +}; + +struct hw_branch_entry { + struct trace_entry ent; + u64 from; + u64 to; +}; + +struct trace_power 
{ + struct trace_entry ent; + struct power_trace state_data; }; /* @@ -172,7 +225,6 @@ struct trace_iterator; struct trace_array { struct ring_buffer *buffer; unsigned long entries; - long ctrl; int cpu; cycle_t time_start; struct task_struct *waiter; @@ -212,13 +264,22 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \ IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ + IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ TRACE_MMIO_MAP); \ - IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \ + IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ + IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ + IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ + IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ + TRACE_GRAPH_ENT); \ + IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ + TRACE_GRAPH_RET); \ + IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ + IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ __ftrace_bad_type(); \ } while (0) @@ -229,29 +290,56 @@ enum print_line_t { TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ }; + +/* + * An option specific to a tracer. This is a boolean value. + * The bit is the bit index that sets its value on the + * flags value in struct tracer_flags. + */ +struct tracer_opt { + const char *name; /* Will appear on the trace_options file */ + u32 bit; /* Mask assigned in val field in tracer_flags */ +}; + +/* + * The set of specific options for a tracer. Your tracer + * have to set the initial value of the flags val. 
+ */ +struct tracer_flags { + u32 val; + struct tracer_opt *opts; +}; + +/* Makes more easy to define a tracer opt */ +#define TRACER_OPT(s, b) .name = #s, .bit = b + /* * A specific tracer, represented by methods that operate on a trace array: */ struct tracer { const char *name; - void (*init)(struct trace_array *tr); + /* Your tracer should raise a warning if init fails */ + int (*init)(struct trace_array *tr); void (*reset)(struct trace_array *tr); + void (*start)(struct trace_array *tr); + void (*stop)(struct trace_array *tr); void (*open)(struct trace_iterator *iter); void (*pipe_open)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); - void (*start)(struct trace_iterator *iter); - void (*stop)(struct trace_iterator *iter); ssize_t (*read)(struct trace_iterator *iter, struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos); - void (*ctrl_update)(struct trace_array *tr); #ifdef CONFIG_FTRACE_STARTUP_TEST int (*selftest)(struct tracer *trace, struct trace_array *tr); #endif + void (*print_header)(struct seq_file *m); enum print_line_t (*print_line)(struct trace_iterator *iter); + /* If you handled the flag setting, return 0 */ + int (*set_flag)(u32 old_flags, u32 bit, int set); struct tracer *next; int print_max; + struct tracer_flags *flags; }; struct trace_seq { @@ -279,10 +367,14 @@ struct trace_iterator { unsigned long iter_flags; loff_t pos; long idx; + + cpumask_t started; }; +int tracing_is_enabled(void); void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); +void tracing_reset_online_cpus(struct trace_array *tr); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *tracing_init_dentry(void); void init_tracer_sysprof_debugfs(struct dentry *d_tracer); @@ -321,8 +413,15 @@ void trace_function(struct trace_array *tr, unsigned long parent_ip, unsigned long flags, int pc); +void trace_graph_return(struct ftrace_graph_ret *trace); +int trace_graph_entry(struct ftrace_graph_ent 
*trace); +void trace_hw_branch(struct trace_array *tr, u64 from, u64 to); + void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); +void tracing_sched_switch_assign_trace(struct trace_array *tr); +void tracing_stop_sched_switch_record(void); +void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); @@ -358,6 +457,7 @@ struct tracer_switch_ops { struct tracer_switch_ops *next; }; +char *trace_find_cmdline(int pid); #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE @@ -383,19 +483,79 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr); +extern int trace_selftest_startup_branch(struct tracer *trace, + struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); extern void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter); + +extern int +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, + unsigned long sym_flags); extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt); extern long ns2usecs(cycle_t nsec); -extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); +extern int +trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args); extern unsigned long trace_flags; +/* Standard output formatting function used for function return traces */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +extern enum print_line_t print_graph_function(struct trace_iterator *iter); + +#ifdef CONFIG_DYNAMIC_FTRACE +/* TODO: make this variable */ +#define FTRACE_GRAPH_MAX_FUNCS 32 +extern int ftrace_graph_count; +extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; + +static inline int ftrace_graph_addr(unsigned long 
addr) +{ + int i; + + if (!ftrace_graph_count || test_tsk_trace_graph(current)) + return 1; + + for (i = 0; i < ftrace_graph_count; i++) { + if (addr == ftrace_graph_funcs[i]) + return 1; + } + + return 0; +} +#else +static inline int ftrace_trace_addr(unsigned long addr) +{ + return 1; +} +static inline int ftrace_graph_addr(unsigned long addr) +{ + return 1; +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#else /* CONFIG_FUNCTION_GRAPH_TRACER */ +static inline enum print_line_t +print_graph_function(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +extern struct pid *ftrace_pid_trace; + +static inline int ftrace_trace_task(struct task_struct *task) +{ + if (!ftrace_pid_trace) + return 1; + + return test_tsk_trace_trace(task); +} + /* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. @@ -415,8 +575,93 @@ enum trace_iterator_flags { TRACE_ITER_STACKTRACE = 0x100, TRACE_ITER_SCHED_TREE = 0x200, TRACE_ITER_PRINTK = 0x400, + TRACE_ITER_PREEMPTONLY = 0x800, + TRACE_ITER_BRANCH = 0x1000, + TRACE_ITER_ANNOTATE = 0x2000, + TRACE_ITER_USERSTACKTRACE = 0x4000, + TRACE_ITER_SYM_USEROBJ = 0x8000, + TRACE_ITER_PRINTK_MSGONLY = 0x10000 }; +/* + * TRACE_ITER_SYM_MASK masks the options in trace_flags that + * control the output of kernel symbols. + */ +#define TRACE_ITER_SYM_MASK \ + (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) + extern struct tracer nop_trace; +/** + * ftrace_preempt_disable - disable preemption scheduler safe + * + * When tracing can happen inside the scheduler, there exist + * cases that the tracing might happen before the need_resched + * flag is checked. If this happens and the tracer calls + * preempt_enable (after a disable), a schedule might take place + * causing an infinite recursion. + * + * To prevent this, we read the need_resched flag before + * disabling preemption.
When we want to enable preemption we + * check the flag, if it is set, then we call preempt_enable_no_resched. + * Otherwise, we call preempt_enable. + * + * The rationale for doing the above is that if need resched is set + * and we have yet to reschedule, we are either in an atomic location + * (where we do not need to check for scheduling) or we are inside + * the scheduler and do not want to resched. + */ +static inline int ftrace_preempt_disable(void) +{ + int resched; + + resched = need_resched(); + preempt_disable_notrace(); + + return resched; +} + +/** + * ftrace_preempt_enable - enable preemption scheduler safe + * @resched: the return value from ftrace_preempt_disable + * + * This is a scheduler safe way to enable preemption and not miss + * any preemption checks. The disable saved the state of preemption. + * If resched is set, then we were either inside an atomic or + * are inside the scheduler (we would have already scheduled + * otherwise). In this case, we do not want to call normal + * preempt_enable, but preempt_enable_no_resched instead.
+ */ +static inline void ftrace_preempt_enable(int resched) +{ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +#ifdef CONFIG_BRANCH_TRACER +extern int enable_branch_tracing(struct trace_array *tr); +extern void disable_branch_tracing(void); +static inline int trace_branch_enable(struct trace_array *tr) +{ + if (trace_flags & TRACE_ITER_BRANCH) + return enable_branch_tracing(tr); + return 0; +} +static inline void trace_branch_disable(void) +{ + /* due to races, always disable */ + disable_branch_tracing(); +} +#else +static inline int trace_branch_enable(struct trace_array *tr) +{ + return 0; +} +static inline void trace_branch_disable(void) +{ +} +#endif /* CONFIG_BRANCH_TRACER */ + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index d0a5e50eeff..3ccebde2848 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -13,101 +13,161 @@ #include "trace.h" static struct trace_array *boot_trace; -static int trace_boot_enabled; +static bool pre_initcalls_finished; - -/* Should be started after do_pre_smp_initcalls() in init/main.c */ +/* Tells the boot tracer that the pre_smp_initcalls are finished. + * So we are ready. + * It doesn't enable sched events tracing however. + * You have to call enable_boot_trace to do so.
+ */ void start_boot_trace(void) { - trace_boot_enabled = 1; + pre_initcalls_finished = true; } -void stop_boot_trace(void) +void enable_boot_trace(void) { - trace_boot_enabled = 0; + if (pre_initcalls_finished) + tracing_start_sched_switch_record(); } -void reset_boot_trace(struct trace_array *tr) +void disable_boot_trace(void) { - stop_boot_trace(); + if (pre_initcalls_finished) + tracing_stop_sched_switch_record(); } -static void boot_trace_init(struct trace_array *tr) +static int boot_trace_init(struct trace_array *tr) { int cpu; boot_trace = tr; - trace_boot_enabled = 0; - for_each_cpu_mask(cpu, cpu_possible_map) tracing_reset(tr, cpu); + + tracing_sched_switch_assign_trace(tr); + return 0; } -static void boot_trace_ctrl_update(struct trace_array *tr) +static enum print_line_t +initcall_call_print_line(struct trace_iterator *iter) { - if (tr->ctrl) - start_boot_trace(); + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct trace_boot_call *field; + struct boot_trace_call *call; + u64 ts; + unsigned long nsec_rem; + int ret; + + trace_assign_type(field, entry); + call = &field->boot_call; + ts = iter->ts; + nsec_rem = do_div(ts, 1000000000); + + ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", + (unsigned long)ts, nsec_rem, call->func, call->caller); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; else - stop_boot_trace(); + return TRACE_TYPE_HANDLED; } -static enum print_line_t initcall_print_line(struct trace_iterator *iter) +static enum print_line_t +initcall_ret_print_line(struct trace_iterator *iter) { - int ret; struct trace_entry *entry = iter->ent; - struct trace_boot *field = (struct trace_boot *)entry; - struct boot_trace *it = &field->initcall; struct trace_seq *s = &iter->seq; - struct timespec calltime = ktime_to_timespec(it->calltime); - struct timespec rettime = ktime_to_timespec(it->rettime); - - if (entry->type == TRACE_BOOT) { - ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", - 
calltime.tv_sec, - calltime.tv_nsec, - it->func, it->caller); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " - "returned %d after %lld msecs\n", - rettime.tv_sec, - rettime.tv_nsec, - it->func, it->result, it->duration); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + struct trace_boot_ret *field; + struct boot_trace_ret *init_ret; + u64 ts; + unsigned long nsec_rem; + int ret; + + trace_assign_type(field, entry); + init_ret = &field->boot_ret; + ts = iter->ts; + nsec_rem = do_div(ts, 1000000000); + + ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " + "returned %d after %llu msecs\n", + (unsigned long) ts, + nsec_rem, + init_ret->func, init_ret->result, init_ret->duration); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + else return TRACE_TYPE_HANDLED; +} + +static enum print_line_t initcall_print_line(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + + switch (entry->type) { + case TRACE_BOOT_CALL: + return initcall_call_print_line(iter); + case TRACE_BOOT_RET: + return initcall_ret_print_line(iter); + default: + return TRACE_TYPE_UNHANDLED; } - return TRACE_TYPE_UNHANDLED; } struct tracer boot_tracer __read_mostly = { .name = "initcall", .init = boot_trace_init, - .reset = reset_boot_trace, - .ctrl_update = boot_trace_ctrl_update, + .reset = tracing_reset_online_cpus, .print_line = initcall_print_line, }; -void trace_boot(struct boot_trace *it, initcall_t fn) +void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { struct ring_buffer_event *event; - struct trace_boot *entry; - struct trace_array_cpu *data; + struct trace_boot_call *entry; unsigned long irq_flags; struct trace_array *tr = boot_trace; - if (!trace_boot_enabled) + if (!pre_initcalls_finished) return; /* Get its name now since this function could * disappear because it is in the .init section. 
*/ - sprint_symbol(it->func, (unsigned long)fn); + sprint_symbol(bt->func, (unsigned long)fn); + preempt_disable(); + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + entry->ent.type = TRACE_BOOT_CALL; + entry->boot_call = *bt; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); + + out: + preempt_enable(); +} + +void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) +{ + struct ring_buffer_event *event; + struct trace_boot_ret *entry; + unsigned long irq_flags; + struct trace_array *tr = boot_trace; + + if (!pre_initcalls_finished) + return; + + sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - data = tr->data[smp_processor_id()]; event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags); @@ -115,8 +175,8 @@ void trace_boot(struct boot_trace *it, initcall_t fn) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); - entry->ent.type = TRACE_BOOT; - entry->initcall = *it; + entry->ent.type = TRACE_BOOT_RET; + entry->boot_ret = *bt; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); trace_wake_up(); diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c new file mode 100644 index 00000000000..6c00feb3bac --- /dev/null +++ b/kernel/trace/trace_branch.c @@ -0,0 +1,342 @@ +/* + * unlikely profiler + * + * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> + */ +#include <linux/kallsyms.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> +#include <linux/irqflags.h> +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/ftrace.h> +#include <linux/hash.h> +#include <linux/fs.h> +#include <asm/local.h> +#include "trace.h" + +#ifdef CONFIG_BRANCH_TRACER + +static int branch_tracing_enabled __read_mostly; +static 
DEFINE_MUTEX(branch_tracing_mutex); +static struct trace_array *branch_tracer; + +static void +probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) +{ + struct trace_array *tr = branch_tracer; + struct ring_buffer_event *event; + struct trace_branch *entry; + unsigned long flags, irq_flags; + int cpu, pc; + const char *p; + + /* + * I would love to save just the ftrace_likely_data pointer, but + * this code can also be used by modules. Ugly things can happen + * if the module is unloaded, and then we go and read the + * pointer. This is slower, but much safer. + */ + + if (unlikely(!tr)) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) + goto out; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + + pc = preempt_count(); + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_BRANCH; + + /* Strip off the path, only save the file */ + p = f->file + strlen(f->file); + while (p >= f->file && *p != '/') + p--; + p++; + + strncpy(entry->func, f->func, TRACE_FUNC_SIZE); + strncpy(entry->file, p, TRACE_FILE_SIZE); + entry->func[TRACE_FUNC_SIZE] = 0; + entry->file[TRACE_FILE_SIZE] = 0; + entry->line = f->line; + entry->correct = val == expect; + + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + out: + atomic_dec(&tr->data[cpu]->disabled); + local_irq_restore(flags); +} + +static inline +void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +{ + if (!branch_tracing_enabled) + return; + + probe_likely_condition(f, val, expect); +} + +int enable_branch_tracing(struct trace_array *tr) +{ + int ret = 0; + + mutex_lock(&branch_tracing_mutex); + branch_tracer = tr; + /* + * Must be seen before enabling. 
The reader is a condition + * where we do not need a matching rmb() + */ + smp_wmb(); + branch_tracing_enabled++; + mutex_unlock(&branch_tracing_mutex); + + return ret; +} + +void disable_branch_tracing(void) +{ + mutex_lock(&branch_tracing_mutex); + + if (!branch_tracing_enabled) + goto out_unlock; + + branch_tracing_enabled--; + + out_unlock: + mutex_unlock(&branch_tracing_mutex); +} + +static void start_branch_trace(struct trace_array *tr) +{ + enable_branch_tracing(tr); +} + +static void stop_branch_trace(struct trace_array *tr) +{ + disable_branch_tracing(); +} + +static int branch_trace_init(struct trace_array *tr) +{ + int cpu; + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + + start_branch_trace(tr); + return 0; +} + +static void branch_trace_reset(struct trace_array *tr) +{ + stop_branch_trace(tr); +} + +struct tracer branch_trace __read_mostly = +{ + .name = "branch", + .init = branch_trace_init, + .reset = branch_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_branch, +#endif +}; + +__init static int init_branch_trace(void) +{ + return register_tracer(&branch_trace); +} + +device_initcall(init_branch_trace); +#else +static inline +void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +{ +} +#endif /* CONFIG_BRANCH_TRACER */ + +void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) +{ + /* + * I would love to have a trace point here instead, but the + * trace point code is so inundated with unlikely and likely + * conditions that the recursive nightmare that exists is too + * much to try to get working. At least for now. + */ + trace_likely_condition(f, val, expect); + + /* FIXME: Make this atomic! 
*/ + if (val == expect) + f->correct++; + else + f->incorrect++; +} +EXPORT_SYMBOL(ftrace_likely_update); + +struct ftrace_pointer { + void *start; + void *stop; + int hit; +}; + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + const struct ftrace_pointer *f = m->private; + struct ftrace_branch_data *p = v; + + (*pos)++; + + if (v == (void *)1) + return f->start; + + ++p; + + if ((void *)p >= (void *)f->stop) + return NULL; + + return p; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + void *t = (void *)1; + loff_t l = 0; + + for (; t && l < *pos; t = t_next(m, t, &l)) + ; + + return t; +} + +static void t_stop(struct seq_file *m, void *p) +{ +} + +static int t_show(struct seq_file *m, void *v) +{ + const struct ftrace_pointer *fp = m->private; + struct ftrace_branch_data *p = v; + const char *f; + long percent; + + if (v == (void *)1) { + if (fp->hit) + seq_printf(m, " miss hit %% "); + else + seq_printf(m, " correct incorrect %% "); + seq_printf(m, " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); + return 0; + } + + /* Only print the file, not the path */ + f = p->file + strlen(p->file); + while (f >= p->file && *f != '/') + f--; + f++; + + /* + * The miss is overlayed on correct, and hit on incorrect. + */ + if (p->correct) { + percent = p->incorrect * 100; + percent /= p->correct + p->incorrect; + } else + percent = p->incorrect ? 
100 : -1; + + seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); + if (percent < 0) + seq_printf(m, " X "); + else + seq_printf(m, "%3ld ", percent); + seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); + return 0; +} + +static struct seq_operations tracing_likely_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int tracing_branch_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &tracing_likely_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = (void *)inode->i_private; + } + + return ret; +} + +static const struct file_operations tracing_branch_fops = { + .open = tracing_branch_open, + .read = seq_read, + .llseek = seq_lseek, +}; + +#ifdef CONFIG_PROFILE_ALL_BRANCHES +extern unsigned long __start_branch_profile[]; +extern unsigned long __stop_branch_profile[]; + +static const struct ftrace_pointer ftrace_branch_pos = { + .start = __start_branch_profile, + .stop = __stop_branch_profile, + .hit = 1, +}; + +#endif /* CONFIG_PROFILE_ALL_BRANCHES */ + +extern unsigned long __start_annotated_branch_profile[]; +extern unsigned long __stop_annotated_branch_profile[]; + +static const struct ftrace_pointer ftrace_annotated_branch_pos = { + .start = __start_annotated_branch_profile, + .stop = __stop_annotated_branch_profile, +}; + +static __init int ftrace_branch_init(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer, + (void *)&ftrace_annotated_branch_pos, + &tracing_branch_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'profile_annotatet_branch' entry\n"); + +#ifdef CONFIG_PROFILE_ALL_BRANCHES + entry = debugfs_create_file("profile_branch", 0444, d_tracer, + (void *)&ftrace_branch_pos, + &tracing_branch_fops); + if (!entry) + pr_warning("Could not create debugfs" + " 'profile_branch' entry\n"); +#endif + 
+ return 0; +} + +device_initcall(ftrace_branch_init); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 0f85a64003d..9236d7e25a1 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -16,20 +16,10 @@ #include "trace.h" -static void function_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - static void start_function_trace(struct trace_array *tr) { tr->cpu = get_cpu(); - function_reset(tr); + tracing_reset_online_cpus(tr); put_cpu(); tracing_start_cmdline_record(); @@ -42,24 +32,20 @@ static void stop_function_trace(struct trace_array *tr) tracing_stop_cmdline_record(); } -static void function_trace_init(struct trace_array *tr) +static int function_trace_init(struct trace_array *tr) { - if (tr->ctrl) - start_function_trace(tr); + start_function_trace(tr); + return 0; } static void function_trace_reset(struct trace_array *tr) { - if (tr->ctrl) - stop_function_trace(tr); + stop_function_trace(tr); } -static void function_trace_ctrl_update(struct trace_array *tr) +static void function_trace_start(struct trace_array *tr) { - if (tr->ctrl) - start_function_trace(tr); - else - stop_function_trace(tr); + tracing_reset_online_cpus(tr); } static struct tracer function_trace __read_mostly = @@ -67,7 +53,7 @@ static struct tracer function_trace __read_mostly = .name = "function", .init = function_trace_init, .reset = function_trace_reset, - .ctrl_update = function_trace_ctrl_update, + .start = function_trace_start, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function, #endif diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c new file mode 100644 index 00000000000..4bf39fcae97 --- /dev/null +++ b/kernel/trace/trace_functions_graph.c @@ -0,0 +1,669 @@ +/* + * + * Function graph tracer. 
+ * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com> + * Mostly borrowed from function tracer which + * is Copyright (c) Steven Rostedt <srostedt@redhat.com> + * + */ +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <linux/fs.h> + +#include "trace.h" + +#define TRACE_GRAPH_INDENT 2 + +/* Flag options */ +#define TRACE_GRAPH_PRINT_OVERRUN 0x1 +#define TRACE_GRAPH_PRINT_CPU 0x2 +#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 +#define TRACE_GRAPH_PRINT_PROC 0x8 + +static struct tracer_opt trace_opts[] = { + /* Display overruns ? */ + { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, + /* Display CPU ? */ + { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) }, + /* Display Overhead ? */ + { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) }, + /* Display proc name/pid */ + { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) }, + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + /* Don't display overruns and proc by default */ + .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD, + .opts = trace_opts +}; + +/* pid on the last trace processed */ +static pid_t last_pid[NR_CPUS] = { [0 ... 
NR_CPUS-1] = -1 }; + +static int graph_trace_init(struct trace_array *tr) +{ + int cpu, ret; + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + + ret = register_ftrace_graph(&trace_graph_return, + &trace_graph_entry); + if (ret) + return ret; + tracing_start_cmdline_record(); + + return 0; +} + +static void graph_trace_reset(struct trace_array *tr) +{ + tracing_stop_cmdline_record(); + unregister_ftrace_graph(); +} + +static inline int log10_cpu(int nb) +{ + if (nb / 100) + return 3; + if (nb / 10) + return 2; + return 1; +} + +static enum print_line_t +print_graph_cpu(struct trace_seq *s, int cpu) +{ + int i; + int ret; + int log10_this = log10_cpu(cpu); + int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map)); + + + /* + * Start with a space character - to make it stand out + * to the right a bit when trace output is pasted into + * email: + */ + ret = trace_seq_printf(s, " "); + + /* + * Tricky - we space the CPU field according to the max + * number of online CPUs. On a 2-cpu system it would take + * a maximum of 1 digit - on a 128 cpu system it would + * take up to 3 digits: + */ + for (i = 0; i < log10_all - log10_this; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + ret = trace_seq_printf(s, "%d) ", cpu); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +#define TRACE_GRAPH_PROCINFO_LENGTH 14 + +static enum print_line_t +print_graph_proc(struct trace_seq *s, pid_t pid) +{ + int i; + int ret; + int len; + char comm[8]; + int spaces = 0; + /* sign + log10(MAX_INT) + '\0' */ + char pid_str[11]; + + strncpy(comm, trace_find_cmdline(pid), 7); + comm[7] = '\0'; + sprintf(pid_str, "%d", pid); + + /* 1 stands for the "-" character */ + len = strlen(comm) + strlen(pid_str) + 1; + + if (len < TRACE_GRAPH_PROCINFO_LENGTH) + spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; + + /* First spaces to align center */ + for (i = 0; i < spaces / 2; i++) { + ret = trace_seq_printf(s, " "); + if 
(!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "%s-%s", comm, pid_str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Last spaces to align center */ + for (i = 0; i < spaces - (spaces / 2); i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + return TRACE_TYPE_HANDLED; +} + + +/* If the pid changed since the last trace, output this event */ +static enum print_line_t +verif_pid(struct trace_seq *s, pid_t pid, int cpu) +{ + pid_t prev_pid; + int ret; + + if (last_pid[cpu] != -1 && last_pid[cpu] == pid) + return TRACE_TYPE_HANDLED; + + prev_pid = last_pid[cpu]; + last_pid[cpu] = pid; + +/* + * Context-switch trace line: + + ------------------------------------------ + | 1) migration/0--1 => sshd-1755 + ------------------------------------------ + + */ + ret = trace_seq_printf(s, + " ------------------------------------------\n"); + if (!ret) + TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_proc(s, prev_pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " => "); + if (!ret) + TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_proc(s, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, + "\n ------------------------------------------\n\n"); + if (!ret) + TRACE_TYPE_PARTIAL_LINE; + + return ret; +} + +static bool +trace_branch_is_leaf(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *curr) +{ + struct ring_buffer_iter *ring_iter; + struct ring_buffer_event *event; + struct ftrace_graph_ret_entry *next; + + ring_iter = iter->buffer_iter[iter->cpu]; + + if (!ring_iter) + return false; + + event = ring_buffer_iter_peek(ring_iter, NULL); + + if (!event) + return false; + + next = ring_buffer_event_data(event); + + if (next->ent.type != TRACE_GRAPH_RET) + return false; + + if (curr->ent.pid != 
next->ent.pid || + curr->graph_ent.func != next->ret.func) + return false; + + return true; +} + +static enum print_line_t +print_graph_irq(struct trace_seq *s, unsigned long addr, + enum trace_type type, int cpu, pid_t pid) +{ + int ret; + + if (addr < (unsigned long)__irqentry_text_start || + addr >= (unsigned long)__irqentry_text_end) + return TRACE_TYPE_UNHANDLED; + + if (type == TRACE_GRAPH_ENT) { + ret = trace_seq_printf(s, "==========> | "); + } else { + /* Cpu */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* No overhead */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "<========== |\n"); + } + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_duration(unsigned long long duration, struct trace_seq *s) +{ + unsigned long nsecs_rem = do_div(duration, 1000); + /* log10(ULONG_MAX) + '\0' */ + char msecs_str[21]; + char nsecs_str[5]; + int ret, len; + int i; + + sprintf(msecs_str, "%lu", (unsigned long) duration); + + /* Print msecs */ + ret = trace_seq_printf(s, msecs_str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + len = strlen(msecs_str); + + /* Print nsecs (we don't want to exceed 7 numbers) */ + if (len < 7) { + snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); + ret = trace_seq_printf(s, ".%s", nsecs_str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + len += strlen(nsecs_str); + } + + ret = trace_seq_printf(s, " us "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Print remaining 
spaces to fit the row's width */ + for (i = len; i < 7; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "| "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; + +} + +/* Signal a overhead of time execution to the output */ +static int +print_graph_overhead(unsigned long long duration, struct trace_seq *s) +{ + /* Duration exceeded 100 msecs */ + if (duration > 100000ULL) + return trace_seq_printf(s, "! "); + + /* Duration exceeded 10 msecs */ + if (duration > 10000ULL) + return trace_seq_printf(s, "+ "); + + return trace_seq_printf(s, " "); +} + +/* Case of a leaf function on its call entry */ +static enum print_line_t +print_graph_entry_leaf(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *entry, struct trace_seq *s) +{ + struct ftrace_graph_ret_entry *ret_entry; + struct ftrace_graph_ret *graph_ret; + struct ring_buffer_event *event; + struct ftrace_graph_ent *call; + unsigned long long duration; + int ret; + int i; + + event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); + ret_entry = ring_buffer_event_data(event); + graph_ret = &ret_entry->ret; + call = &entry->graph_ent; + duration = graph_ret->rettime - graph_ret->calltime; + + /* Overhead */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = print_graph_overhead(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Duration */ + ret = print_graph_duration(duration, s); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + /* Function */ + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = seq_print_ip_sym(s, call->func, 0); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, "();\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t 
+print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, + struct trace_seq *s, pid_t pid, int cpu) +{ + int i; + int ret; + struct ftrace_graph_ent *call = &entry->graph_ent; + + /* No overhead */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Interrupt */ + ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid); + if (ret == TRACE_TYPE_UNHANDLED) { + /* No time */ + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } else { + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + + /* Function */ + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = seq_print_ip_sym(s, call->func, 0); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, "() {\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, + struct trace_iterator *iter, int cpu) +{ + int ret; + struct trace_entry *ent = iter->ent; + + /* Pid */ + if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + /* Cpu */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + if (trace_branch_is_leaf(iter, field)) + return print_graph_entry_leaf(iter, field, s); + else + return print_graph_entry_nested(field, s, iter->ent->pid, cpu); + +} + +static enum print_line_t 
+print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, + struct trace_entry *ent, int cpu) +{ + int i; + int ret; + unsigned long long duration = trace->rettime - trace->calltime; + + /* Pid */ + if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + /* Cpu */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Overhead */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = print_graph_overhead(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Duration */ + ret = print_graph_duration(duration, s); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + /* Closing brace */ + for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "}\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Overrun */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { + ret = trace_seq_printf(s, " (Overruns: %lu)\n", + trace->overrun); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_comment(struct print_entry *trace, struct trace_seq *s, + struct trace_entry *ent, struct trace_iterator *iter) +{ + int i; + int ret; + + /* Pid */ + if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + /* Cpu */ + if (tracer_flags.val 
& TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, iter->cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* No overhead */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* No time */ + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Indentation */ + if (trace->depth > 0) + for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* The comment */ + ret = trace_seq_printf(s, "/* %s", trace->buf); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (ent->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + + ret = trace_seq_printf(s, " */\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + + +enum print_line_t +print_graph_function(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + + switch (entry->type) { + case TRACE_GRAPH_ENT: { + struct ftrace_graph_ent_entry *field; + trace_assign_type(field, entry); + return print_graph_entry(field, s, iter, + iter->cpu); + } + case TRACE_GRAPH_RET: { + struct ftrace_graph_ret_entry *field; + trace_assign_type(field, entry); + return print_graph_return(&field->ret, s, entry, iter->cpu); + } + case TRACE_PRINT: { + struct print_entry *field; + trace_assign_type(field, entry); + return print_graph_comment(field, s, entry, iter); + } + default: + return TRACE_TYPE_UNHANDLED; + } +} + +static void print_graph_headers(struct seq_file *s) +{ + /* 1st line */ + seq_printf(s, "# "); + if (tracer_flags.val & 
TRACE_GRAPH_PRINT_CPU) + seq_printf(s, "CPU "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + seq_printf(s, "TASK/PID "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) + seq_printf(s, "OVERHEAD/"); + seq_printf(s, "DURATION FUNCTION CALLS\n"); + + /* 2nd line */ + seq_printf(s, "# "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + seq_printf(s, "| "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + seq_printf(s, "| | "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + seq_printf(s, "| "); + seq_printf(s, "| | | | |\n"); + } else + seq_printf(s, " | | | | |\n"); +} +static struct tracer graph_trace __read_mostly = { + .name = "function_graph", + .init = graph_trace_init, + .reset = graph_trace_reset, + .print_line = print_graph_function, + .print_header = print_graph_headers, + .flags = &tracer_flags, +}; + +static __init int init_graph_trace(void) +{ + return register_tracer(&graph_trace); +} + +device_initcall(init_graph_trace); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c new file mode 100644 index 00000000000..b6a3e20a49a --- /dev/null +++ b/kernel/trace/trace_hw_branches.c @@ -0,0 +1,195 @@ +/* + * h/w branch tracer for x86 based on bts + * + * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com> + * + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/debugfs.h> +#include <linux/ftrace.h> +#include <linux/kallsyms.h> + +#include <asm/ds.h> + +#include "trace.h" + + +#define SIZEOF_BTS (1 << 13) + +static DEFINE_PER_CPU(struct bts_tracer *, tracer); +static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); + +#define this_tracer per_cpu(tracer, smp_processor_id()) +#define this_buffer per_cpu(buffer, smp_processor_id()) + + +static void bts_trace_start_cpu(void *arg) +{ + if (this_tracer) + ds_release_bts(this_tracer); + + this_tracer = + ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, + /* ovfl = */ NULL, /* th = */ (size_t)-1, + BTS_KERNEL); + if 
(IS_ERR(this_tracer)) { + this_tracer = NULL; + return; + } +} + +static void bts_trace_start(struct trace_array *tr) +{ + int cpu; + + tracing_reset_online_cpus(tr); + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); +} + +static void bts_trace_stop_cpu(void *arg) +{ + if (this_tracer) { + ds_release_bts(this_tracer); + this_tracer = NULL; + } +} + +static void bts_trace_stop(struct trace_array *tr) +{ + int cpu; + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); +} + +static int bts_trace_init(struct trace_array *tr) +{ + tracing_reset_online_cpus(tr); + bts_trace_start(tr); + + return 0; +} + +static void bts_trace_print_header(struct seq_file *m) +{ + seq_puts(m, + "# CPU# FROM TO FUNCTION\n"); + seq_puts(m, + "# | | | |\n"); +} + +static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *seq = &iter->seq; + struct hw_branch_entry *it; + + trace_assign_type(it, entry); + + if (entry->type == TRACE_HW_BRANCHES) { + if (trace_seq_printf(seq, "%4d ", entry->cpu) && + trace_seq_printf(seq, "0x%016llx -> 0x%016llx ", + it->from, it->to) && + (!it->from || + seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) && + trace_seq_printf(seq, "\n")) + return TRACE_TYPE_HANDLED; + return TRACE_TYPE_PARTIAL_LINE;; + } + return TRACE_TYPE_UNHANDLED; +} + +void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) +{ + struct ring_buffer_event *event; + struct hw_branch_entry *entry; + unsigned long irq; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, from); + entry->ent.type = TRACE_HW_BRANCHES; + entry->ent.cpu = smp_processor_id(); + entry->from = from; + entry->to = to; + ring_buffer_unlock_commit(tr->buffer, event, irq); +} + +static void 
trace_bts_at(struct trace_array *tr, + const struct bts_trace *trace, void *at) +{ + struct bts_struct bts; + int err = 0; + + WARN_ON_ONCE(!trace->read); + if (!trace->read) + return; + + err = trace->read(this_tracer, at, &bts); + if (err < 0) + return; + + switch (bts.qualifier) { + case BTS_BRANCH: + trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to); + break; + } +} + +static void trace_bts_cpu(void *arg) +{ + struct trace_array *tr = (struct trace_array *) arg; + const struct bts_trace *trace; + unsigned char *at; + + if (!this_tracer) + return; + + ds_suspend_bts(this_tracer); + trace = ds_read_bts(this_tracer); + if (!trace) + goto out; + + for (at = trace->ds.top; (void *)at < trace->ds.end; + at += trace->ds.size) + trace_bts_at(tr, trace, at); + + for (at = trace->ds.begin; (void *)at < trace->ds.top; + at += trace->ds.size) + trace_bts_at(tr, trace, at); + +out: + ds_resume_bts(this_tracer); +} + +static void trace_bts_prepare(struct trace_iterator *iter) +{ + int cpu; + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); +} + +struct tracer bts_tracer __read_mostly = +{ + .name = "hw-branch-tracer", + .init = bts_trace_init, + .reset = bts_trace_stop, + .print_header = bts_trace_print_header, + .print_line = bts_trace_print_line, + .start = bts_trace_start, + .stop = bts_trace_stop, + .open = trace_bts_prepare +}; + +__init static int init_bts_trace(void) +{ + return register_tracer(&bts_tracer); +} +device_initcall(init_bts_trace); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 9c74071c10e..7c2e326bbc8 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -353,15 +353,28 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ +/* + * save_tracer_enabled is used to save the state of the tracer_enabled + * variable when we disable it when we open a trace output file. 
+ */ +static int save_tracer_enabled; + static void start_irqsoff_tracer(struct trace_array *tr) { register_ftrace_function(&trace_ops); - tracer_enabled = 1; + if (tracing_is_enabled()) { + tracer_enabled = 1; + save_tracer_enabled = 1; + } else { + tracer_enabled = 0; + save_tracer_enabled = 0; + } } static void stop_irqsoff_tracer(struct trace_array *tr) { tracer_enabled = 0; + save_tracer_enabled = 0; unregister_ftrace_function(&trace_ops); } @@ -370,53 +383,55 @@ static void __irqsoff_tracer_init(struct trace_array *tr) irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); - - if (tr->ctrl) - start_irqsoff_tracer(tr); + start_irqsoff_tracer(tr); } static void irqsoff_tracer_reset(struct trace_array *tr) { - if (tr->ctrl) - stop_irqsoff_tracer(tr); + stop_irqsoff_tracer(tr); } -static void irqsoff_tracer_ctrl_update(struct trace_array *tr) +static void irqsoff_tracer_start(struct trace_array *tr) { - if (tr->ctrl) - start_irqsoff_tracer(tr); - else - stop_irqsoff_tracer(tr); + tracer_enabled = 1; + save_tracer_enabled = 1; +} + +static void irqsoff_tracer_stop(struct trace_array *tr) +{ + tracer_enabled = 0; + save_tracer_enabled = 0; } static void irqsoff_tracer_open(struct trace_iterator *iter) { /* stop the trace while dumping */ - if (iter->tr->ctrl) - stop_irqsoff_tracer(iter->tr); + tracer_enabled = 0; } static void irqsoff_tracer_close(struct trace_iterator *iter) { - if (iter->tr->ctrl) - start_irqsoff_tracer(iter->tr); + /* restart tracing */ + tracer_enabled = save_tracer_enabled; } #ifdef CONFIG_IRQSOFF_TRACER -static void irqsoff_tracer_init(struct trace_array *tr) +static int irqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF; __irqsoff_tracer_init(tr); + return 0; } static struct tracer irqsoff_tracer __read_mostly = { .name = "irqsoff", .init = irqsoff_tracer_init, .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, .open = irqsoff_tracer_open, .close = 
irqsoff_tracer_close, - .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, @@ -428,11 +443,12 @@ static struct tracer irqsoff_tracer __read_mostly = #endif #ifdef CONFIG_PREEMPT_TRACER -static void preemptoff_tracer_init(struct trace_array *tr) +static int preemptoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_PREEMPT_OFF; __irqsoff_tracer_init(tr); + return 0; } static struct tracer preemptoff_tracer __read_mostly = @@ -440,9 +456,10 @@ static struct tracer preemptoff_tracer __read_mostly = .name = "preemptoff", .init = preemptoff_tracer_init, .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, .open = irqsoff_tracer_open, .close = irqsoff_tracer_close, - .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, @@ -456,11 +473,12 @@ static struct tracer preemptoff_tracer __read_mostly = #if defined(CONFIG_IRQSOFF_TRACER) && \ defined(CONFIG_PREEMPT_TRACER) -static void preemptirqsoff_tracer_init(struct trace_array *tr) +static int preemptirqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; __irqsoff_tracer_init(tr); + return 0; } static struct tracer preemptirqsoff_tracer __read_mostly = @@ -468,9 +486,10 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .name = "preemptirqsoff", .init = preemptirqsoff_tracer_init, .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, .open = irqsoff_tracer_open, .close = irqsoff_tracer_close, - .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index e62cbf78eab..fffcb069f1d 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ 
-22,44 +22,35 @@ static unsigned long prev_overruns; static void mmio_reset_data(struct trace_array *tr) { - int cpu; - overrun_detected = false; prev_overruns = 0; - tr->time_start = ftrace_now(tr->cpu); - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); + tracing_reset_online_cpus(tr); } -static void mmio_trace_init(struct trace_array *tr) +static int mmio_trace_init(struct trace_array *tr) { pr_debug("in %s\n", __func__); mmio_trace_array = tr; - if (tr->ctrl) { - mmio_reset_data(tr); - enable_mmiotrace(); - } + + mmio_reset_data(tr); + enable_mmiotrace(); + return 0; } static void mmio_trace_reset(struct trace_array *tr) { pr_debug("in %s\n", __func__); - if (tr->ctrl) - disable_mmiotrace(); + + disable_mmiotrace(); mmio_reset_data(tr); mmio_trace_array = NULL; } -static void mmio_trace_ctrl_update(struct trace_array *tr) +static void mmio_trace_start(struct trace_array *tr) { pr_debug("in %s\n", __func__); - if (tr->ctrl) { - mmio_reset_data(tr); - enable_mmiotrace(); - } else { - disable_mmiotrace(); - } + mmio_reset_data(tr); } static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) @@ -296,10 +287,10 @@ static struct tracer mmio_tracer __read_mostly = .name = "mmiotrace", .init = mmio_trace_init, .reset = mmio_trace_reset, + .start = mmio_trace_start, .pipe_open = mmio_pipe_open, .close = mmio_close, .read = mmio_read, - .ctrl_update = mmio_trace_ctrl_update, .print_line = mmio_print_line, }; @@ -371,5 +362,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map) int mmio_trace_printk(const char *fmt, va_list args) { - return trace_vprintk(0, fmt, args); + return trace_vprintk(0, -1, fmt, args); } diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 4592b486251..b9767acd30a 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -12,6 +12,27 @@ #include "trace.h" +/* Our two options */ +enum { + TRACE_NOP_OPT_ACCEPT = 0x1, + TRACE_NOP_OPT_REFUSE = 0x2 +}; + +/* Options for the tracer (see trace_options 
file) */ +static struct tracer_opt nop_opts[] = { + /* Option that will be accepted by set_flag callback */ + { TRACER_OPT(test_nop_accept, TRACE_NOP_OPT_ACCEPT) }, + /* Option that will be refused by set_flag callback */ + { TRACER_OPT(test_nop_refuse, TRACE_NOP_OPT_REFUSE) }, + { } /* Always set a last empty entry */ +}; + +static struct tracer_flags nop_flags = { + /* You can check your flags value here when you want. */ + .val = 0, /* By default: all flags disabled */ + .opts = nop_opts +}; + static struct trace_array *ctx_trace; static void start_nop_trace(struct trace_array *tr) @@ -24,7 +45,7 @@ static void stop_nop_trace(struct trace_array *tr) /* Nothing to do! */ } -static void nop_trace_init(struct trace_array *tr) +static int nop_trace_init(struct trace_array *tr) { int cpu; ctx_trace = tr; @@ -32,33 +53,53 @@ static void nop_trace_init(struct trace_array *tr) for_each_online_cpu(cpu) tracing_reset(tr, cpu); - if (tr->ctrl) - start_nop_trace(tr); + start_nop_trace(tr); + return 0; } static void nop_trace_reset(struct trace_array *tr) { - if (tr->ctrl) - stop_nop_trace(tr); + stop_nop_trace(tr); } -static void nop_trace_ctrl_update(struct trace_array *tr) +/* It only serves as a signal handler and a callback to + * accept or refuse tthe setting of a flag. + * If you don't implement it, then the flag setting will be + * automatically accepted. + */ +static int nop_set_flag(u32 old_flags, u32 bit, int set) { - /* When starting a new trace, reset the buffers */ - if (tr->ctrl) - start_nop_trace(tr); - else - stop_nop_trace(tr); + /* + * Note that you don't need to update nop_flags.val yourself. + * The tracing Api will do it automatically if you return 0 + */ + if (bit == TRACE_NOP_OPT_ACCEPT) { + printk(KERN_DEBUG "nop_test_accept flag set to %d: we accept." + " Now cat trace_options to see the result\n", + set); + return 0; + } + + if (bit == TRACE_NOP_OPT_REFUSE) { + printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse." 
+ "Now cat trace_options to see the result\n", + set); + return -EINVAL; + } + + return 0; } + struct tracer nop_trace __read_mostly = { .name = "nop", .init = nop_trace_init, .reset = nop_trace_reset, - .ctrl_update = nop_trace_ctrl_update, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_nop, #endif + .flags = &nop_flags, + .set_flag = nop_set_flag }; diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c new file mode 100644 index 00000000000..a7172a352f6 --- /dev/null +++ b/kernel/trace/trace_power.c @@ -0,0 +1,179 @@ +/* + * ring buffer based C-state tracer + * + * Arjan van de Ven <arjan@linux.intel.com> + * Copyright (C) 2008 Intel Corporation + * + * Much is borrowed from trace_boot.c which is + * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> + * + */ + +#include <linux/init.h> +#include <linux/debugfs.h> +#include <linux/ftrace.h> +#include <linux/kallsyms.h> +#include <linux/module.h> + +#include "trace.h" + +static struct trace_array *power_trace; +static int __read_mostly trace_power_enabled; + + +static void start_power_trace(struct trace_array *tr) +{ + trace_power_enabled = 1; +} + +static void stop_power_trace(struct trace_array *tr) +{ + trace_power_enabled = 0; +} + + +static int power_trace_init(struct trace_array *tr) +{ + int cpu; + power_trace = tr; + + trace_power_enabled = 1; + + for_each_cpu_mask(cpu, cpu_possible_map) + tracing_reset(tr, cpu); + return 0; +} + +static enum print_line_t power_print_line(struct trace_iterator *iter) +{ + int ret = 0; + struct trace_entry *entry = iter->ent; + struct trace_power *field ; + struct power_trace *it; + struct trace_seq *s = &iter->seq; + struct timespec stamp; + struct timespec duration; + + trace_assign_type(field, entry); + it = &field->state_data; + stamp = ktime_to_timespec(it->stamp); + duration = ktime_to_timespec(ktime_sub(it->end, it->stamp)); + + if (entry->type == TRACE_POWER) { + if (it->type == POWER_CSTATE) + ret = trace_seq_printf(s, 
"[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n", + stamp.tv_sec, + stamp.tv_nsec, + it->state, iter->cpu, + duration.tv_sec, + duration.tv_nsec); + if (it->type == POWER_PSTATE) + ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n", + stamp.tv_sec, + stamp.tv_nsec, + it->state, iter->cpu); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; + } + return TRACE_TYPE_UNHANDLED; +} + +static struct tracer power_tracer __read_mostly = +{ + .name = "power", + .init = power_trace_init, + .start = start_power_trace, + .stop = stop_power_trace, + .reset = stop_power_trace, + .print_line = power_print_line, +}; + +static int init_power_trace(void) +{ + return register_tracer(&power_tracer); +} +device_initcall(init_power_trace); + +void trace_power_start(struct power_trace *it, unsigned int type, + unsigned int level) +{ + if (!trace_power_enabled) + return; + + memset(it, 0, sizeof(struct power_trace)); + it->state = level; + it->type = type; + it->stamp = ktime_get(); +} +EXPORT_SYMBOL_GPL(trace_power_start); + + +void trace_power_end(struct power_trace *it) +{ + struct ring_buffer_event *event; + struct trace_power *entry; + struct trace_array_cpu *data; + unsigned long irq_flags; + struct trace_array *tr = power_trace; + + if (!trace_power_enabled) + return; + + preempt_disable(); + it->end = ktime_get(); + data = tr->data[smp_processor_id()]; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + entry->ent.type = TRACE_POWER; + entry->state_data = *it; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); + + out: + preempt_enable(); +} +EXPORT_SYMBOL_GPL(trace_power_end); + +void trace_power_mark(struct power_trace *it, unsigned int type, + unsigned int level) +{ + struct ring_buffer_event *event; + struct trace_power *entry; + struct 
trace_array_cpu *data; + unsigned long irq_flags; + struct trace_array *tr = power_trace; + + if (!trace_power_enabled) + return; + + memset(it, 0, sizeof(struct power_trace)); + it->state = level; + it->type = type; + it->stamp = ktime_get(); + preempt_disable(); + it->end = it->stamp; + data = tr->data[smp_processor_id()]; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + entry->ent.type = TRACE_POWER; + entry->state_data = *it; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); + + out: + preempt_enable(); +} +EXPORT_SYMBOL_GPL(trace_power_mark); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index b8f56beb1a6..df175cb4564 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -16,7 +16,8 @@ static struct trace_array *ctx_trace; static int __read_mostly tracer_enabled; -static atomic_t sched_ref; +static int sched_ref; +static DEFINE_MUTEX(sched_register_mutex); static void probe_sched_switch(struct rq *__rq, struct task_struct *prev, @@ -27,7 +28,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, int cpu; int pc; - if (!atomic_read(&sched_ref)) + if (!sched_ref) return; tracing_record_cmdline(prev); @@ -48,7 +49,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, } static void -probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee) +probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) { struct trace_array_cpu *data; unsigned long flags; @@ -71,16 +72,6 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee) local_irq_restore(flags); } -static void sched_switch_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - static int tracing_sched_register(void) { 
int ret; @@ -123,20 +114,18 @@ static void tracing_sched_unregister(void) static void tracing_start_sched_switch(void) { - long ref; - - ref = atomic_inc_return(&sched_ref); - if (ref == 1) + mutex_lock(&sched_register_mutex); + if (!(sched_ref++)) tracing_sched_register(); + mutex_unlock(&sched_register_mutex); } static void tracing_stop_sched_switch(void) { - long ref; - - ref = atomic_dec_and_test(&sched_ref); - if (ref) + mutex_lock(&sched_register_mutex); + if (!(--sched_ref)) tracing_sched_unregister(); + mutex_unlock(&sched_register_mutex); } void tracing_start_cmdline_record(void) @@ -149,40 +138,86 @@ void tracing_stop_cmdline_record(void) tracing_stop_sched_switch(); } +/** + * tracing_start_sched_switch_record - start tracing context switches + * + * Turns on context switch tracing for a tracer. + */ +void tracing_start_sched_switch_record(void) +{ + if (unlikely(!ctx_trace)) { + WARN_ON(1); + return; + } + + tracing_start_sched_switch(); + + mutex_lock(&sched_register_mutex); + tracer_enabled++; + mutex_unlock(&sched_register_mutex); +} + +/** + * tracing_stop_sched_switch_record - start tracing context switches + * + * Turns off context switch tracing for a tracer. + */ +void tracing_stop_sched_switch_record(void) +{ + mutex_lock(&sched_register_mutex); + tracer_enabled--; + WARN_ON(tracer_enabled < 0); + mutex_unlock(&sched_register_mutex); + + tracing_stop_sched_switch(); +} + +/** + * tracing_sched_switch_assign_trace - assign a trace array for ctx switch + * @tr: trace array pointer to assign + * + * Some tracers might want to record the context switches in their + * trace. This function lets those tracers assign the trace array + * to use. 
+ */ +void tracing_sched_switch_assign_trace(struct trace_array *tr) +{ + ctx_trace = tr; +} + static void start_sched_trace(struct trace_array *tr) { - sched_switch_reset(tr); - tracing_start_cmdline_record(); - tracer_enabled = 1; + tracing_reset_online_cpus(tr); + tracing_start_sched_switch_record(); } static void stop_sched_trace(struct trace_array *tr) { - tracer_enabled = 0; - tracing_stop_cmdline_record(); + tracing_stop_sched_switch_record(); } -static void sched_switch_trace_init(struct trace_array *tr) +static int sched_switch_trace_init(struct trace_array *tr) { ctx_trace = tr; - - if (tr->ctrl) - start_sched_trace(tr); + start_sched_trace(tr); + return 0; } static void sched_switch_trace_reset(struct trace_array *tr) { - if (tr->ctrl) + if (sched_ref) stop_sched_trace(tr); } -static void sched_switch_trace_ctrl_update(struct trace_array *tr) +static void sched_switch_trace_start(struct trace_array *tr) { - /* When starting a new trace, reset the buffers */ - if (tr->ctrl) - start_sched_trace(tr); - else - stop_sched_trace(tr); + tracing_reset_online_cpus(tr); + tracing_start_sched_switch(); +} + +static void sched_switch_trace_stop(struct trace_array *tr) +{ + tracing_stop_sched_switch(); } static struct tracer sched_switch_trace __read_mostly = @@ -190,7 +225,8 @@ static struct tracer sched_switch_trace __read_mostly = .name = "sched_switch", .init = sched_switch_trace_init, .reset = sched_switch_trace_reset, - .ctrl_update = sched_switch_trace_ctrl_update, + .start = sched_switch_trace_start, + .stop = sched_switch_trace_stop, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_sched_switch, #endif @@ -198,14 +234,7 @@ static struct tracer sched_switch_trace __read_mostly = __init static int init_sched_switch_trace(void) { - int ret = 0; - - if (atomic_read(&sched_ref)) - ret = tracing_sched_register(); - if (ret) { - pr_info("error registering scheduler trace\n"); - return ret; - } return register_tracer(&sched_switch_trace); } 
device_initcall(init_sched_switch_trace); + diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 3ae93f16b56..43586b689e3 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -50,8 +50,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) return; pc = preempt_count(); - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -81,15 +80,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) out: atomic_dec(&data->disabled); - /* - * To prevent recursion from the scheduler, if the - * resched flag was set before we entered, then - * don't reschedule. - */ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); } static struct ftrace_ops trace_ops __read_mostly = @@ -220,7 +211,7 @@ static void wakeup_reset(struct trace_array *tr) } static void -probe_wakeup(struct rq *rq, struct task_struct *p) +probe_wakeup(struct rq *rq, struct task_struct *p, int success) { int cpu = smp_processor_id(); unsigned long flags; @@ -271,6 +262,12 @@ out: atomic_dec(&wakeup_trace->data[cpu]->disabled); } +/* + * save_tracer_enabled is used to save the state of the tracer_enabled + * variable when we disable it when we open a trace output file. 
+ */ +static int save_tracer_enabled; + static void start_wakeup_tracer(struct trace_array *tr) { int ret; @@ -309,7 +306,13 @@ static void start_wakeup_tracer(struct trace_array *tr) register_ftrace_function(&trace_ops); - tracer_enabled = 1; + if (tracing_is_enabled()) { + tracer_enabled = 1; + save_tracer_enabled = 1; + } else { + tracer_enabled = 0; + save_tracer_enabled = 0; + } return; fail_deprobe_wake_new: @@ -321,49 +324,53 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; + save_tracer_enabled = 0; unregister_ftrace_function(&trace_ops); unregister_trace_sched_switch(probe_wakeup_sched_switch); unregister_trace_sched_wakeup_new(probe_wakeup); unregister_trace_sched_wakeup(probe_wakeup); } -static void wakeup_tracer_init(struct trace_array *tr) +static int wakeup_tracer_init(struct trace_array *tr) { wakeup_trace = tr; - - if (tr->ctrl) - start_wakeup_tracer(tr); + start_wakeup_tracer(tr); + return 0; } static void wakeup_tracer_reset(struct trace_array *tr) { - if (tr->ctrl) { - stop_wakeup_tracer(tr); - /* make sure we put back any tasks we are tracing */ - wakeup_reset(tr); - } + stop_wakeup_tracer(tr); + /* make sure we put back any tasks we are tracing */ + wakeup_reset(tr); +} + +static void wakeup_tracer_start(struct trace_array *tr) +{ + wakeup_reset(tr); + tracer_enabled = 1; + save_tracer_enabled = 1; } -static void wakeup_tracer_ctrl_update(struct trace_array *tr) +static void wakeup_tracer_stop(struct trace_array *tr) { - if (tr->ctrl) - start_wakeup_tracer(tr); - else - stop_wakeup_tracer(tr); + tracer_enabled = 0; + save_tracer_enabled = 0; } static void wakeup_tracer_open(struct trace_iterator *iter) { /* stop the trace while dumping */ - if (iter->tr->ctrl) - stop_wakeup_tracer(iter->tr); + tracer_enabled = 0; } static void wakeup_tracer_close(struct trace_iterator *iter) { /* forget about any processes we were recording */ - if (iter->tr->ctrl) - start_wakeup_tracer(iter->tr); + if 
(save_tracer_enabled) { + wakeup_reset(iter->tr); + tracer_enabled = 1; + } } static struct tracer wakeup_tracer __read_mostly = @@ -371,9 +378,10 @@ static struct tracer wakeup_tracer __read_mostly = .name = "wakeup", .init = wakeup_tracer_init, .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, .open = wakeup_tracer_open, .close = wakeup_tracer_close, - .ctrl_update = wakeup_tracer_ctrl_update, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 90bc752a758..88c8eb70f54 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -13,6 +13,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_STACK: case TRACE_PRINT: case TRACE_SPECIAL: + case TRACE_BRANCH: return 1; } return 0; @@ -51,7 +52,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) int cpu, ret = 0; /* Don't allow flipping of max traces now */ - raw_local_irq_save(flags); + local_irq_save(flags); __raw_spin_lock(&ftrace_max_lock); cnt = ring_buffer_entries(tr->buffer); @@ -62,7 +63,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) break; } __raw_spin_unlock(&ftrace_max_lock); - raw_local_irq_restore(flags); + local_irq_restore(flags); if (count) *count = cnt; @@ -70,6 +71,11 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) return ret; } +static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) +{ + printk(KERN_WARNING "Failed to init %s tracer, init returned %d\n", + trace->name, init_ret); +} #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE @@ -110,8 +116,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, ftrace_set_filter(func_name, strlen(func_name), 1); /* enable tracing */ - tr->ctrl = 1; - trace->init(tr); + ret = trace->init(tr); + if (ret) { + 
warn_failed_init_tracer(trace, ret); + goto out; + } /* Sleep for a 1/10 of a second */ msleep(100); @@ -134,13 +143,13 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, msleep(100); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); ftrace_enabled = 0; /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); + tracing_start(); /* we should only have one item */ if (!ret && count != 1) { @@ -148,6 +157,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, ret = -1; goto out; } + out: ftrace_enabled = save_ftrace_enabled; tracer_enabled = save_tracer_enabled; @@ -180,18 +190,22 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ftrace_enabled = 1; tracer_enabled = 1; - tr->ctrl = 1; - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } + /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); ftrace_enabled = 0; /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); + tracing_start(); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -223,8 +237,12 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - tr->ctrl = 1; - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + /* reset the max latency */ tracing_max_latency = 0; /* disable interrupts for a bit */ @@ -232,13 +250,13 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) udelay(100); local_irq_enable(); /* stop the tracing. 
*/ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); if (!ret) ret = trace_test_buffer(&max_tr, &count); trace->reset(tr); + tracing_start(); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -259,9 +277,26 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) unsigned long count; int ret; + /* + * Now that the big kernel lock is no longer preemptable, + * and this is called with the BKL held, it will always + * fail. If preemption is already disabled, simply + * pass the test. When the BKL is removed, or becomes + * preemptible again, we will once again test this, + * so keep it in. + */ + if (preempt_count()) { + printk(KERN_CONT "can not test ... force "); + return 0; + } + /* start the tracing */ - tr->ctrl = 1; - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + /* reset the max latency */ tracing_max_latency = 0; /* disable preemption for a bit */ @@ -269,13 +304,13 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) udelay(100); preempt_enable(); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); if (!ret) ret = trace_test_buffer(&max_tr, &count); trace->reset(tr); + tracing_start(); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -296,9 +331,25 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * unsigned long count; int ret; + /* + * Now that the big kernel lock is no longer preemptable, + * and this is called with the BKL held, it will always + * fail. If preemption is already disabled, simply + * pass the test. When the BKL is removed, or becomes + * preemptible again, we will once again test this, + * so keep it in. + */ + if (preempt_count()) { + printk(KERN_CONT "can not test ... 
force "); + return 0; + } + /* start the tracing */ - tr->ctrl = 1; - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } /* reset the max latency */ tracing_max_latency = 0; @@ -312,27 +363,30 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * local_irq_enable(); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); - if (ret) + if (ret) { + tracing_start(); goto out; + } ret = trace_test_buffer(&max_tr, &count); - if (ret) + if (ret) { + tracing_start(); goto out; + } if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); ret = -1; + tracing_start(); goto out; } /* do the test by disabling interrupts first this time */ tracing_max_latency = 0; - tr->ctrl = 1; - trace->ctrl_update(tr); + tracing_start(); preempt_disable(); local_irq_disable(); udelay(100); @@ -341,8 +395,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * local_irq_enable(); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); if (ret) @@ -358,6 +411,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * out: trace->reset(tr); + tracing_start(); tracing_max_latency = save_max; return ret; @@ -423,8 +477,12 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) wait_for_completion(&isrt); /* start the tracing */ - tr->ctrl = 1; - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + /* reset the max latency */ tracing_max_latency = 0; @@ -448,8 +506,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) msleep(100); /* stop the tracing. 
*/ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); if (!ret) @@ -457,6 +514,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) trace->reset(tr); + tracing_start(); tracing_max_latency = save_max; @@ -480,16 +538,20 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr int ret; /* start the tracing */ - tr->ctrl = 1; - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); + tracing_start(); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -508,17 +570,48 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - tr->ctrl = 1; - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return 0; + } + /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); + tracing_start(); return ret; } #endif /* CONFIG_SYSPROF_TRACER */ + +#ifdef CONFIG_BRANCH_TRACER +int +trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. 
*/ + tracing_stop(); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + tracing_start(); + + return ret; +} +#endif /* CONFIG_BRANCH_TRACER */ diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 3bdb44bde4b..d0871bc0aca 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -10,6 +10,7 @@ #include <linux/debugfs.h> #include <linux/ftrace.h> #include <linux/module.h> +#include <linux/sysctl.h> #include <linux/init.h> #include <linux/fs.h> #include "trace.h" @@ -31,6 +32,10 @@ static raw_spinlock_t max_stack_lock = static int stack_trace_disabled __read_mostly; static DEFINE_PER_CPU(int, trace_active); +static DEFINE_MUTEX(stack_sysctl_mutex); + +int stack_tracer_enabled; +static int last_stack_tracer_enabled; static inline void check_stack(void) { @@ -48,7 +53,7 @@ static inline void check_stack(void) if (!object_is_on_stack(&this_size)) return; - raw_local_irq_save(flags); + local_irq_save(flags); __raw_spin_lock(&max_stack_lock); /* a race could have already updated it */ @@ -78,6 +83,7 @@ static inline void check_stack(void) * on a new max, so it is far from a fast path. 
*/ while (i < max_stack_trace.nr_entries) { + int found = 0; stack_dump_index[i] = this_size; p = start; @@ -86,17 +92,19 @@ static inline void check_stack(void) if (*p == stack_dump_trace[i]) { this_size = stack_dump_index[i++] = (top - p) * sizeof(unsigned long); + found = 1; /* Start the search from here */ start = p + 1; } } - i++; + if (!found) + i++; } out: __raw_spin_unlock(&max_stack_lock); - raw_local_irq_restore(flags); + local_irq_restore(flags); } static void @@ -107,8 +115,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) if (unlikely(!ftrace_enabled || stack_trace_disabled)) return; - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); /* no atomic needed, we only modify this variable by this cpu */ @@ -120,10 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) out: per_cpu(trace_active, cpu)--; /* prevent recursion in schedule */ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); } static struct ftrace_ops trace_ops __read_mostly = @@ -166,16 +170,16 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - raw_local_irq_save(flags); + local_irq_save(flags); __raw_spin_lock(&max_stack_lock); *ptr = val; __raw_spin_unlock(&max_stack_lock); - raw_local_irq_restore(flags); + local_irq_restore(flags); return count; } -static struct file_operations stack_max_size_fops = { +static const struct file_operations stack_max_size_fops = { .open = tracing_open_generic, .read = stack_max_size_read, .write = stack_max_size_write, @@ -273,7 +277,7 @@ static int t_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations stack_trace_seq_ops = { +static const struct seq_operations stack_trace_seq_ops = { .start = t_start, .next = t_next, .stop = t_stop, @@ -289,12 +293,47 @@ static int stack_trace_open(struct inode *inode, struct file *file) 
return ret; } -static struct file_operations stack_trace_fops = { +static const struct file_operations stack_trace_fops = { .open = stack_trace_open, .read = seq_read, .llseek = seq_lseek, }; +int +stack_trace_sysctl(struct ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + mutex_lock(&stack_sysctl_mutex); + + ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + + if (ret || !write || + (last_stack_tracer_enabled == stack_tracer_enabled)) + goto out; + + last_stack_tracer_enabled = stack_tracer_enabled; + + if (stack_tracer_enabled) + register_ftrace_function(&trace_ops); + else + unregister_ftrace_function(&trace_ops); + + out: + mutex_unlock(&stack_sysctl_mutex); + return ret; +} + +static __init int enable_stacktrace(char *str) +{ + stack_tracer_enabled = 1; + last_stack_tracer_enabled = 1; + return 1; +} +__setup("stacktrace", enable_stacktrace); + static __init int stack_trace_init(void) { struct dentry *d_tracer; @@ -312,7 +351,8 @@ static __init int stack_trace_init(void) if (!entry) pr_warning("Could not create debugfs 'stack_trace' entry\n"); - register_ftrace_function(&trace_ops); + if (stack_tracer_enabled) + register_ftrace_function(&trace_ops); return 0; } diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 9587d3bcba5..01becf1f19f 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -234,20 +234,10 @@ static void stop_stack_timers(void) stop_stack_timer(cpu); } -static void stack_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - static void start_stack_trace(struct trace_array *tr) { mutex_lock(&sample_timer_lock); - stack_reset(tr); + tracing_reset_online_cpus(tr); start_stack_timers(); tracer_enabled = 1; mutex_unlock(&sample_timer_lock); @@ -261,27 +251,17 @@ static void stop_stack_trace(struct trace_array *tr) 
mutex_unlock(&sample_timer_lock); } -static void stack_trace_init(struct trace_array *tr) +static int stack_trace_init(struct trace_array *tr) { sysprof_trace = tr; - if (tr->ctrl) - start_stack_trace(tr); + start_stack_trace(tr); + return 0; } static void stack_trace_reset(struct trace_array *tr) { - if (tr->ctrl) - stop_stack_trace(tr); -} - -static void stack_trace_ctrl_update(struct trace_array *tr) -{ - /* When starting a new trace, reset the buffers */ - if (tr->ctrl) - start_stack_trace(tr); - else - stop_stack_trace(tr); + stop_stack_trace(tr); } static struct tracer stack_trace __read_mostly = @@ -289,7 +269,6 @@ static struct tracer stack_trace __read_mostly = .name = "sysprof", .init = stack_trace_init, .reset = stack_trace_reset, - .ctrl_update = stack_trace_ctrl_update, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_sysprof, #endif diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index af8c8566488..79602740bbb 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -43,6 +43,7 @@ static DEFINE_MUTEX(tracepoints_mutex); */ #define TRACEPOINT_HASH_BITS 6 #define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS) +static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; /* * Note about RCU : @@ -54,40 +55,43 @@ struct tracepoint_entry { struct hlist_node hlist; void **funcs; int refcount; /* Number of times armed. 0 if disarmed. 
*/ - struct rcu_head rcu; - void *oldptr; - unsigned char rcu_pending:1; char name[0]; }; -static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; +struct tp_probes { + union { + struct rcu_head rcu; + struct list_head list; + } u; + void *probes[0]; +}; -static void free_old_closure(struct rcu_head *head) +static inline void *allocate_probes(int count) { - struct tracepoint_entry *entry = container_of(head, - struct tracepoint_entry, rcu); - kfree(entry->oldptr); - /* Make sure we free the data before setting the pending flag to 0 */ - smp_wmb(); - entry->rcu_pending = 0; + struct tp_probes *p = kmalloc(count * sizeof(void *) + + sizeof(struct tp_probes), GFP_KERNEL); + return p == NULL ? NULL : p->probes; } -static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old) +static void rcu_free_old_probes(struct rcu_head *head) { - if (!old) - return; - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); + kfree(container_of(head, struct tp_probes, u.rcu)); +} + +static inline void release_probes(void *old) +{ + if (old) { + struct tp_probes *tp_probes = container_of(old, + struct tp_probes, probes[0]); + call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes); + } } static void debug_print_probes(struct tracepoint_entry *entry) { int i; - if (!tracepoint_debug) + if (!tracepoint_debug || !entry->funcs) return; for (i = 0; entry->funcs[i]; i++) @@ -111,12 +115,13 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) return ERR_PTR(-EEXIST); } /* + 2 : one for new probe, one for NULL func */ - new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL); + new = allocate_probes(nr_probes + 2); if (new == NULL) return ERR_PTR(-ENOMEM); if (old) memcpy(new, old, nr_probes * sizeof(void *)); new[nr_probes] = probe; + new[nr_probes + 1] = NULL; entry->refcount = nr_probes + 1; entry->funcs = new; 
debug_print_probes(entry); @@ -132,7 +137,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) old = entry->funcs; if (!old) - return NULL; + return ERR_PTR(-ENOENT); debug_print_probes(entry); /* (N -> M), (N > 1, M >= 0) probes */ @@ -151,13 +156,13 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) int j = 0; /* N -> M, (N > 1, M > 0) */ /* + 1 for NULL */ - new = kzalloc((nr_probes - nr_del + 1) - * sizeof(void *), GFP_KERNEL); + new = allocate_probes(nr_probes - nr_del + 1); if (new == NULL) return ERR_PTR(-ENOMEM); for (i = 0; old[i]; i++) if ((probe && old[i] != probe)) new[j++] = old[i]; + new[nr_probes - nr_del] = NULL; entry->refcount = nr_probes - nr_del; entry->funcs = new; } @@ -215,7 +220,6 @@ static struct tracepoint_entry *add_tracepoint(const char *name) memcpy(&e->name[0], name, name_len); e->funcs = NULL; e->refcount = 0; - e->rcu_pending = 0; hlist_add_head(&e->hlist, head); return e; } @@ -224,32 +228,10 @@ static struct tracepoint_entry *add_tracepoint(const char *name) * Remove the tracepoint from the tracepoint hash table. Must be called with * mutex_lock held. 
*/ -static int remove_tracepoint(const char *name) +static inline void remove_tracepoint(struct tracepoint_entry *e) { - struct hlist_head *head; - struct hlist_node *node; - struct tracepoint_entry *e; - int found = 0; - size_t len = strlen(name) + 1; - u32 hash = jhash(name, len-1, 0); - - head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - found = 1; - break; - } - } - if (!found) - return -ENOENT; - if (e->refcount) - return -EBUSY; hlist_del(&e->hlist); - /* Make sure the call_rcu_sched has been executed */ - if (e->rcu_pending) - rcu_barrier_sched(); kfree(e); - return 0; } /* @@ -280,6 +262,7 @@ static void set_tracepoint(struct tracepoint_entry **entry, static void disable_tracepoint(struct tracepoint *elem) { elem->state = 0; + rcu_assign_pointer(elem->funcs, NULL); } /** @@ -320,6 +303,23 @@ static void tracepoint_update_probes(void) module_update_tracepoints(); } +static void *tracepoint_add_probe(const char *name, void *probe) +{ + struct tracepoint_entry *entry; + void *old; + + entry = get_tracepoint(name); + if (!entry) { + entry = add_tracepoint(name); + if (IS_ERR(entry)) + return entry; + } + old = tracepoint_entry_add_probe(entry, probe); + if (IS_ERR(old) && !entry->refcount) + remove_tracepoint(entry); + return old; +} + /** * tracepoint_probe_register - Connect a probe to a tracepoint * @name: tracepoint name @@ -330,44 +330,36 @@ static void tracepoint_update_probes(void) */ int tracepoint_probe_register(const char *name, void *probe) { - struct tracepoint_entry *entry; - int ret = 0; void *old; mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - if (!entry) { - entry = add_tracepoint(name); - if (IS_ERR(entry)) { - ret = PTR_ERR(entry); - goto end; - } - } - /* - * If we detect that a call_rcu_sched is pending for this tracepoint, - * make sure it's executed now. 
- */ - if (entry->rcu_pending) - rcu_barrier_sched(); - old = tracepoint_entry_add_probe(entry, probe); - if (IS_ERR(old)) { - ret = PTR_ERR(old); - goto end; - } + old = tracepoint_add_probe(name, probe); mutex_unlock(&tracepoints_mutex); + if (IS_ERR(old)) + return PTR_ERR(old); + tracepoint_update_probes(); /* may update entry */ - mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - WARN_ON(!entry); - if (entry->rcu_pending) - rcu_barrier_sched(); - tracepoint_entry_free_old(entry, old); -end: - mutex_unlock(&tracepoints_mutex); - return ret; + release_probes(old); + return 0; } EXPORT_SYMBOL_GPL(tracepoint_probe_register); +static void *tracepoint_remove_probe(const char *name, void *probe) +{ + struct tracepoint_entry *entry; + void *old; + + entry = get_tracepoint(name); + if (!entry) + return ERR_PTR(-ENOENT); + old = tracepoint_entry_remove_probe(entry, probe); + if (IS_ERR(old)) + return old; + if (!entry->refcount) + remove_tracepoint(entry); + return old; +} + /** * tracepoint_probe_unregister - Disconnect a probe from a tracepoint * @name: tracepoint name @@ -380,38 +372,104 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register); */ int tracepoint_probe_unregister(const char *name, void *probe) { - struct tracepoint_entry *entry; void *old; - int ret = -ENOENT; mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - old = tracepoint_entry_remove_probe(entry, probe); - if (!old) { - printk(KERN_WARNING "Warning: Trying to unregister a probe" - "that doesn't exist\n"); - goto end; - } + old = tracepoint_remove_probe(name, probe); mutex_unlock(&tracepoints_mutex); + if (IS_ERR(old)) + return PTR_ERR(old); + tracepoint_update_probes(); /* may update entry */ + release_probes(old); + return 0; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); + +static LIST_HEAD(old_probes); +static int need_update; + +static void tracepoint_add_old_probes(void *old) +{ + 
need_update = 1; + if (old) { + struct tp_probes *tp_probes = container_of(old, + struct tp_probes, probes[0]); + list_add(&tp_probes->u.list, &old_probes); + } +} + +/** + * tracepoint_probe_register_noupdate - register a probe but not connect + * @name: tracepoint name + * @probe: probe handler + * + * caller must call tracepoint_probe_update_all() + */ +int tracepoint_probe_register_noupdate(const char *name, void *probe) +{ + void *old; + mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - tracepoint_entry_free_old(entry, old); - remove_tracepoint(name); /* Ignore busy error message */ - ret = 0; -end: + old = tracepoint_add_probe(name, probe); + if (IS_ERR(old)) { + mutex_unlock(&tracepoints_mutex); + return PTR_ERR(old); + } + tracepoint_add_old_probes(old); mutex_unlock(&tracepoints_mutex); - return ret; + return 0; } -EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); +EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate); + +/** + * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect + * @name: tracepoint name + * @probe: probe function pointer + * + * caller must call tracepoint_probe_update_all() + */ +int tracepoint_probe_unregister_noupdate(const char *name, void *probe) +{ + void *old; + + mutex_lock(&tracepoints_mutex); + old = tracepoint_remove_probe(name, probe); + if (IS_ERR(old)) { + mutex_unlock(&tracepoints_mutex); + return PTR_ERR(old); + } + tracepoint_add_old_probes(old); + mutex_unlock(&tracepoints_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate); + +/** + * tracepoint_probe_update_all - update tracepoints + */ +void tracepoint_probe_update_all(void) +{ + LIST_HEAD(release_probes); + struct tp_probes *pos, *next; + + mutex_lock(&tracepoints_mutex); + if (!need_update) { + mutex_unlock(&tracepoints_mutex); + return; + } + if (!list_empty(&old_probes)) + list_replace_init(&old_probes, &release_probes); + 
need_update = 0; + mutex_unlock(&tracepoints_mutex); + + tracepoint_update_probes(); + list_for_each_entry_safe(pos, next, &release_probes, u.list) { + list_del(&pos->u.list); + call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); + } +} +EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); /** * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. @@ -483,3 +541,36 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter) iter->tracepoint = NULL; } EXPORT_SYMBOL_GPL(tracepoint_iter_reset); + +#ifdef CONFIG_MODULES + +int tracepoint_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + tracepoint_update_probe_range(mod->tracepoints, + mod->tracepoints + mod->num_tracepoints); + break; + case MODULE_STATE_GOING: + tracepoint_update_probe_range(mod->tracepoints, + mod->tracepoints + mod->num_tracepoints); + break; + } + return 0; +} + +struct notifier_block tracepoint_module_nb = { + .notifier_call = tracepoint_module_notify, + .priority = 0, +}; + +static int init_tracepoints(void) +{ + return register_module_notifier(&tracepoint_module_nb); +} +__initcall(init_tracepoints); + +#endif /* CONFIG_MODULES */ |