From ee9c578527a93c66becb526c4a122c5358a959c5 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 20 Apr 2008 13:59:33 +0100 Subject: dyntick: Remove last remnants of dyntick support Remove the last remnants of dyntick support from the generic kernel. Signed-off-by: Russell King --- kernel/hrtimer.c | 2 +- kernel/sysctl.c | 12 ------------ kernel/timer.c | 10 +--------- 3 files changed, 2 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 421be5fe5cc..543d9ca9b4f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1078,7 +1078,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) } EXPORT_SYMBOL_GPL(hrtimer_get_remaining); -#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) +#ifdef CONFIG_NO_HZ /** * hrtimer_get_next_event - get the time until next expiry event * diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d7ffdc59816..76426d93007 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -131,8 +131,6 @@ extern int sysctl_userprocess_debug; extern int spin_retry; #endif -extern int sysctl_hz_timer; - #ifdef CONFIG_BSD_PROCESS_ACCT extern int acct_parm[]; #endif @@ -561,16 +559,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#endif -#ifdef CONFIG_NO_IDLE_HZ - { - .ctl_name = KERN_HZ_TIMER, - .procname = "hz_timer", - .data = &sysctl_hz_timer, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, #endif { .ctl_name = KERN_S390_USER_DEBUG_LOGGING, diff --git a/kernel/timer.c b/kernel/timer.c index ceacc662657..ef3fa6906e8 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -812,7 +812,7 @@ static inline void __run_timers(struct tvec_base *base) spin_unlock_irq(&base->lock); } -#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) +#ifdef CONFIG_NO_HZ /* * Find out when the next timer event is due to happen. This * is used on S/390 to stop all activity when a cpus is idle. @@ -947,14 +947,6 @@ unsigned long get_next_timer_interrupt(unsigned long now) return cmp_next_hrtimer_event(now, expires); } - -#ifdef CONFIG_NO_IDLE_HZ -unsigned long next_timer_interrupt(void) -{ - return get_next_timer_interrupt(jiffies); -} -#endif - #endif #ifndef CONFIG_VIRT_CPU_ACCOUNTING -- cgit v1.2.3 From 493d35863dbb692c38c1415fd83d88dfb902ae37 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 14 May 2008 16:22:58 -0700 Subject: mutex-debug: check mutex magic before owner Currently, the mutex debug code checks the lock->owner before lock->magic, so a corrupt mutex will most likely result in failing the owner check, rather than the magic check. This change to debug_mutex_unlock does the magic check first, so we have a better idea of what breaks. 
Signed-off-by: Jeremy Kerr Acked-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/mutex-debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 3aaa06c561d..1d94160eb53 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -79,8 +79,8 @@ void debug_mutex_unlock(struct mutex *lock) if (unlikely(!debug_locks)) return; - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); DEBUG_LOCKS_WARN_ON(lock->magic != lock); + DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); } -- cgit v1.2.3 From 8b09dee67f484e9b42114b1a1f068e080fd7aa56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:05 +0200 Subject: rcupreempt: remove duplicate prototypes rcu_batches_completed and rcu_patches_completed_bh are both declared in rcuclassic.h and rcupreempt.h. This patch removes the extra prototypes for them from rcupdate.h. rcu_batches_completed_bh is defined as a static inline in the rcupreempt.h header file. Trying to export this as EXPORT_SYMBOL_GPL causes linking problems with the powerpc linker. There's no need to export a static inlined function. Modules must be compiled with the same type of RCU implementation as the kernel they are for. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index e1cdf196a51..5e02b774070 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -217,8 +217,6 @@ long rcu_batches_completed(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed); -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); - void __rcu_read_lock(void) { int idx; -- cgit v1.2.3 From 4446a36ff8c74ac3b32feb009b651048e129c6af Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 May 2008 21:21:05 +0200 Subject: rcu: add call_rcu_sched() Fourth cut of patch to provide the call_rcu_sched(). This is again to synchronize_sched() as call_rcu() is to synchronize_rcu(). Should be fine for experimental and -rt use, but not ready for inclusion. With some luck, I will be able to tell Andrew to come out of hiding on the next round. Passes multi-day rcutorture sessions with concurrent CPU hotplugging. Fixes since the first version include a bug that could result in indefinite blocking (spotted by Gautham Shenoy), better resiliency against CPU-hotplug operations, and other minor fixes. Fixes since the second version include reworking grace-period detection to avoid deadlocks that could happen when running concurrently with CPU hotplug, adding Mathieu's fix to avoid the softlockup messages, as well as Mathieu's fix to allow use earlier in boot. Fixes since the third version include a wrong-CPU bug spotted by Andrew, getting rid of the obsolete synchronize_kernel API that somehow snuck back in, merging spin_unlock() and local_irq_restore() in a few places, commenting the code that checks for quiescent states based on interrupting from user-mode execution or the idle loop, removing some inline attributes, and some code-style changes. Known/suspected shortcomings: o I still do not entirely trust the sleep/wakeup logic. 
Next step will be to use a private snapshot of the CPU online mask in rcu_sched_grace_period() -- if the CPU wasn't there at the start of the grace period, we don't need to hear from it. And the bit about accounting for changes in online CPUs inside of rcu_sched_grace_period() is ugly anyway. o It might be good for rcu_sched_grace_period() to invoke resched_cpu() when a given CPU wasn't responding quickly, but resched_cpu() is declared static... This patch also fixes a long-standing bug in the earlier preemptable-RCU implementation of synchronize_rcu() that could result in loss of concurrent external changes to a task's CPU affinity mask. I still cannot remember who reported this... Signed-off-by: Paul E. McKenney Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/rcupdate.c | 20 +-- kernel/rcupreempt.c | 414 ++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 372 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c09605f8d16..a4e329d9288 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -39,18 +39,12 @@ #include #include #include -#include #include #include #include #include #include -struct rcu_synchronize { - struct rcu_head head; - struct completion completion; -}; - static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); @@ -60,7 +54,7 @@ static struct completion rcu_barrier_completion; * Awaken the corresponding synchronize_rcu() instance now that a * grace period has elapsed. */ -static void wakeme_after_rcu(struct rcu_head *head) +void wakeme_after_rcu(struct rcu_head *head) { struct rcu_synchronize *rcu; @@ -77,17 +71,7 @@ static void wakeme_after_rcu(struct rcu_head *head) * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. */ -void synchronize_rcu(void) -{ - struct rcu_synchronize rcu; - - init_completion(&rcu.completion); - /* Will wake me after RCU finished */ - call_rcu(&rcu.head, wakeme_after_rcu); - - /* Wait for it */ - wait_for_completion(&rcu.completion); -} +synchronize_rcu_xxx(synchronize_rcu, call_rcu) EXPORT_SYMBOL_GPL(synchronize_rcu); static void rcu_barrier_callback(struct rcu_head *notused) diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 5e02b774070..aaa7976bd85 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -87,9 +88,14 @@ struct rcu_data { struct rcu_head **nexttail; struct rcu_head *waitlist[GP_STAGES]; struct rcu_head **waittail[GP_STAGES]; - struct rcu_head *donelist; + struct rcu_head *donelist; /* from waitlist & waitschedlist */ struct rcu_head **donetail; long rcu_flipctr[2]; + struct rcu_head *nextschedlist; + struct rcu_head **nextschedtail; + struct rcu_head *waitschedlist; + struct rcu_head **waitschedtail; + int rcu_sched_sleeping; #ifdef CONFIG_RCU_TRACE struct rcupreempt_trace trace; #endif /* #ifdef CONFIG_RCU_TRACE */ @@ -131,11 +137,24 @@ enum rcu_try_flip_states { rcu_try_flip_waitmb_state, }; +/* + * States for rcu_ctrlblk.rcu_sched_sleep. + */ + +enum rcu_sched_sleep_states { + rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */ + rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */ + rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */ +}; + struct rcu_ctrlblk { spinlock_t fliplock; /* Protect state-machine transitions. 
*/ long completed; /* Number of last completed batch. */ enum rcu_try_flip_states rcu_try_flip_state; /* The current state of the rcu state machine */ + spinlock_t schedlock; /* Protect rcu_sched sleep state. */ + enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */ + wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */ }; static DEFINE_PER_CPU(struct rcu_data, rcu_data); @@ -143,8 +162,12 @@ static struct rcu_ctrlblk rcu_ctrlblk = { .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), .completed = 0, .rcu_try_flip_state = rcu_try_flip_idle_state, + .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock), + .sched_sleep = rcu_sched_not_sleeping, + .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq), }; +static struct task_struct *rcu_sched_grace_period_task; #ifdef CONFIG_RCU_TRACE static char *rcu_try_flip_state_names[] = @@ -207,6 +230,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag) */ #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace)); +#define RCU_SCHED_BATCH_TIME (HZ / 50) + /* * Return the number of RCU batches processed thus far. Useful * for debug and statistics. @@ -411,32 +436,34 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp) } } -#ifdef CONFIG_NO_HZ +DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = { + .dynticks = 1, +}; -DEFINE_PER_CPU(long, dynticks_progress_counter) = 1; -static DEFINE_PER_CPU(long, rcu_dyntick_snapshot); +#ifdef CONFIG_NO_HZ static DEFINE_PER_CPU(int, rcu_update_flag); /** * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI. * * If the CPU was idle with dynamic ticks active, this updates the - * dynticks_progress_counter to let the RCU handling know that the + * rcu_dyntick_sched.dynticks to let the RCU handling know that the * CPU is active. */ void rcu_irq_enter(void) { int cpu = smp_processor_id(); + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); if (per_cpu(rcu_update_flag, cpu)) per_cpu(rcu_update_flag, cpu)++; /* * Only update if we are coming from a stopped ticks mode - * (dynticks_progress_counter is even). + * (rcu_dyntick_sched.dynticks is even). */ if (!in_interrupt() && - (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) { + (rdssp->dynticks & 0x1) == 0) { /* * The following might seem like we could have a race * with NMI/SMIs. But this really isn't a problem. @@ -459,12 +486,12 @@ void rcu_irq_enter(void) * RCU read-side critical sections on this CPU would * have already completed. */ - per_cpu(dynticks_progress_counter, cpu)++; + rdssp->dynticks++; /* * The following memory barrier ensures that any * rcu_read_lock() primitives in the irq handler * are seen by other CPUs to follow the above - * increment to dynticks_progress_counter. This is + * increment to rcu_dyntick_sched.dynticks. This is * required in order for other CPUs to correctly * determine when it is safe to advance the RCU * grace-period state machine. @@ -472,7 +499,7 @@ void rcu_irq_enter(void) smp_mb(); /* see above block comment. */ /* * Since we can't determine the dynamic tick mode from - * the dynticks_progress_counter after this routine, + * the rcu_dyntick_sched.dynticks after this routine, * we use a second flag to acknowledge that we came * from an idle state with ticks stopped. */ @@ -480,7 +507,7 @@ void rcu_irq_enter(void) /* * If we take an NMI/SMI now, they will also increment * the rcu_update_flag, and will not update the - * dynticks_progress_counter on exit. That is for + * rcu_dyntick_sched.dynticks on exit. 
That is for * this IRQ to do. */ } @@ -490,12 +517,13 @@ void rcu_irq_enter(void) * rcu_irq_exit - Called from exiting Hard irq context. * * If the CPU was idle with dynamic ticks active, update the - * dynticks_progress_counter to put let the RCU handling be + * rcu_dyntick_sched.dynticks to put let the RCU handling be * aware that the CPU is going back to idle with no ticks. */ void rcu_irq_exit(void) { int cpu = smp_processor_id(); + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); /* * rcu_update_flag is set if we interrupted the CPU @@ -503,7 +531,7 @@ void rcu_irq_exit(void) * Once this occurs, we keep track of interrupt nesting * because a NMI/SMI could also come in, and we still * only want the IRQ that started the increment of the - * dynticks_progress_counter to be the one that modifies + * rcu_dyntick_sched.dynticks to be the one that modifies * it on exit. */ if (per_cpu(rcu_update_flag, cpu)) { @@ -515,28 +543,29 @@ void rcu_irq_exit(void) /* * If an NMI/SMI happens now we are still - * protected by the dynticks_progress_counter being odd. + * protected by the rcu_dyntick_sched.dynticks being odd. */ /* * The following memory barrier ensures that any * rcu_read_unlock() primitives in the irq handler * are seen by other CPUs to preceed the following - * increment to dynticks_progress_counter. This + * increment to rcu_dyntick_sched.dynticks. This * is required in order for other CPUs to determine * when it is safe to advance the RCU grace-period * state machine. */ smp_mb(); /* see above block comment. */ - per_cpu(dynticks_progress_counter, cpu)++; - WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1); + rdssp->dynticks++; + WARN_ON(rdssp->dynticks & 0x1); } } static void dyntick_save_progress_counter(int cpu) { - per_cpu(rcu_dyntick_snapshot, cpu) = - per_cpu(dynticks_progress_counter, cpu); + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + rdssp->dynticks_snap = rdssp->dynticks; } static inline int @@ -544,9 +573,10 @@ rcu_try_flip_waitack_needed(int cpu) { long curr; long snap; + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - curr = per_cpu(dynticks_progress_counter, cpu); - snap = per_cpu(rcu_dyntick_snapshot, cpu); + curr = rdssp->dynticks; + snap = rdssp->dynticks_snap; smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ /* @@ -580,9 +610,10 @@ rcu_try_flip_waitmb_needed(int cpu) { long curr; long snap; + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - curr = per_cpu(dynticks_progress_counter, cpu); - snap = per_cpu(rcu_dyntick_snapshot, cpu); + curr = rdssp->dynticks; + snap = rdssp->dynticks_snap; smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ /* @@ -609,14 +640,86 @@ rcu_try_flip_waitmb_needed(int cpu) return 1; } +static void dyntick_save_progress_counter_sched(int cpu) +{ + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + rdssp->sched_dynticks_snap = rdssp->dynticks; +} + +static int rcu_qsctr_inc_needed_dyntick(int cpu) +{ + long curr; + long snap; + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + curr = rdssp->dynticks; + snap = rdssp->sched_dynticks_snap; + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + + /* + * If the CPU remained in dynticks mode for the entire time + * and didn't take any interrupts, NMIs, SMIs, or whatever, + * then it cannot be in the middle of an rcu_read_lock(), so + * the next rcu_read_lock() it executes must use the new value + * of the counter. 
Therefore, this CPU has been in a quiescent + * state the entire time, and we don't need to wait for it. + */ + + if ((curr == snap) && ((curr & 0x1) == 0)) + return 0; + + /* + * If the CPU passed through or entered a dynticks idle phase with + * no active irq handlers, then, as above, this CPU has already + * passed through a quiescent state. + */ + + if ((curr - snap) > 2 || (snap & 0x1) == 0) + return 0; + + /* We need this CPU to go through a quiescent state. */ + + return 1; +} + #else /* !CONFIG_NO_HZ */ -# define dyntick_save_progress_counter(cpu) do { } while (0) -# define rcu_try_flip_waitack_needed(cpu) (1) -# define rcu_try_flip_waitmb_needed(cpu) (1) +# define dyntick_save_progress_counter(cpu) do { } while (0) +# define rcu_try_flip_waitack_needed(cpu) (1) +# define rcu_try_flip_waitmb_needed(cpu) (1) + +# define dyntick_save_progress_counter_sched(cpu) do { } while (0) +# define rcu_qsctr_inc_needed_dyntick(cpu) (1) #endif /* CONFIG_NO_HZ */ +static void save_qsctr_sched(int cpu) +{ + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + rdssp->sched_qs_snap = rdssp->sched_qs; +} + +static inline int rcu_qsctr_inc_needed(int cpu) +{ + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + /* + * If there has been a quiescent state, no more need to wait + * on this CPU. + */ + + if (rdssp->sched_qs != rdssp->sched_qs_snap) { + smp_mb(); /* force ordering with cpu entering schedule(). */ + return 0; + } + + /* We need this CPU to go through a quiescent state. */ + + return 1; +} + /* * Get here when RCU is idle. Decide whether we need to * move out of idle state, and return non-zero if so. @@ -819,6 +922,26 @@ void rcu_check_callbacks(int cpu, int user) unsigned long flags; struct rcu_data *rdp = RCU_DATA_CPU(cpu); + /* + * If this CPU took its interrupt from user mode or from the + * idle loop, and this is not a nested interrupt, then + * this CPU has to have exited all prior preept-disable + * sections of code. So increment the counter to note this. + * + * The memory barrier is needed to handle the case where + * writes from a preempt-disable section of code get reordered + * into schedule() by this CPU's write buffer. So the memory + * barrier makes sure that the rcu_qsctr_inc() is seen by other + * CPUs to happen after any such write. + */ + + if (user || + (idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + smp_mb(); /* Guard against aggressive schedule(). */ + rcu_qsctr_inc(cpu); + } + rcu_check_mb(cpu); if (rcu_ctrlblk.completed == rdp->completed) rcu_try_flip(); @@ -869,6 +992,8 @@ void rcu_offline_cpu(int cpu) struct rcu_head *list = NULL; unsigned long flags; struct rcu_data *rdp = RCU_DATA_CPU(cpu); + struct rcu_head *schedlist = NULL; + struct rcu_head **schedtail = &schedlist; struct rcu_head **tail = &list; /* @@ -882,6 +1007,11 @@ void rcu_offline_cpu(int cpu) rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i], list, tail); rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail); + rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail, + schedlist, schedtail); + rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail, + schedlist, schedtail); + rdp->rcu_sched_sleeping = 0; spin_unlock_irqrestore(&rdp->lock, flags); rdp->waitlistcount = 0; @@ -916,22 +1046,40 @@ void rcu_offline_cpu(int cpu) * fix. */ - local_irq_save(flags); + local_irq_save(flags); /* disable preempt till we know what lock. 
*/ rdp = RCU_DATA_ME(); spin_lock(&rdp->lock); *rdp->nexttail = list; if (list) rdp->nexttail = tail; + *rdp->nextschedtail = schedlist; + if (schedlist) + rdp->nextschedtail = schedtail; spin_unlock_irqrestore(&rdp->lock, flags); } void __devinit rcu_online_cpu(int cpu) { unsigned long flags; + struct rcu_data *rdp; spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); cpu_set(cpu, rcu_cpu_online_map); spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); + + /* + * The rcu_sched grace-period processing might have bypassed + * this CPU, given that it was not in the rcu_cpu_online_map + * when the grace-period scan started. This means that the + * grace-period task might sleep. So make sure that if this + * should happen, the first callback posted to this CPU will + * wake up the grace-period task if need be. + */ + + rdp = RCU_DATA_CPU(cpu); + spin_lock_irqsave(&rdp->lock, flags); + rdp->rcu_sched_sleeping = 1; + spin_unlock_irqrestore(&rdp->lock, flags); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -986,31 +1134,196 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) *rdp->nexttail = head; rdp->nexttail = &head->next; RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp); - spin_unlock(&rdp->lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&rdp->lock, flags); } EXPORT_SYMBOL_GPL(call_rcu); +void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + struct rcu_data *rdp; + int wake_gp = 0; + + head->func = func; + head->next = NULL; + local_irq_save(flags); + rdp = RCU_DATA_ME(); + spin_lock(&rdp->lock); + *rdp->nextschedtail = head; + rdp->nextschedtail = &head->next; + if (rdp->rcu_sched_sleeping) { + + /* Grace-period processing might be sleeping... */ + + rdp->rcu_sched_sleeping = 0; + wake_gp = 1; + } + spin_unlock_irqrestore(&rdp->lock, flags); + if (wake_gp) { + + /* Wake up grace-period processing, unless someone beat us. */ + + spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); + if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping) + wake_gp = 0; + rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping; + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + if (wake_gp) + wake_up_interruptible(&rcu_ctrlblk.sched_wq); + } +} +EXPORT_SYMBOL_GPL(call_rcu_sched); + /* * Wait until all currently running preempt_disable() code segments * (including hardware-irq-disable segments) complete. Note that * in -rt this does -not- necessarily result in all currently executing * interrupt -handlers- having completed. */ -void __synchronize_sched(void) +synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched) +EXPORT_SYMBOL_GPL(__synchronize_sched); + +/* + * kthread function that manages call_rcu_sched grace periods. + */ +static int rcu_sched_grace_period(void *arg) { - cpumask_t oldmask; + int couldsleep; /* might sleep after current pass. */ + int couldsleepnext = 0; /* might sleep after next pass. */ int cpu; + unsigned long flags; + struct rcu_data *rdp; + int ret; - if (sched_getaffinity(0, &oldmask) < 0) - oldmask = cpu_possible_map; - for_each_online_cpu(cpu) { - sched_setaffinity(0, &cpumask_of_cpu(cpu)); - schedule(); - } - sched_setaffinity(0, &oldmask); + /* + * Each pass through the following loop handles one + * rcu_sched grace period cycle. + */ + do { + /* Save each CPU's current state. */ + + for_each_online_cpu(cpu) { + dyntick_save_progress_counter_sched(cpu); + save_qsctr_sched(cpu); + } + + /* + * Sleep for about an RCU grace-period's worth to + * allow better batching and to consume less CPU. 
+ */ + schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME); + + /* + * If there was nothing to do last time, prepare to + * sleep at the end of the current grace period cycle. + */ + couldsleep = couldsleepnext; + couldsleepnext = 1; + if (couldsleep) { + spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); + rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep; + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + } + + /* + * Wait on each CPU in turn to have either visited + * a quiescent state or been in dynticks-idle mode. + */ + for_each_online_cpu(cpu) { + while (rcu_qsctr_inc_needed(cpu) && + rcu_qsctr_inc_needed_dyntick(cpu)) { + /* resched_cpu(cpu); @@@ */ + schedule_timeout_interruptible(1); + } + } + + /* Advance callbacks for each CPU. */ + + for_each_online_cpu(cpu) { + + rdp = RCU_DATA_CPU(cpu); + spin_lock_irqsave(&rdp->lock, flags); + + /* + * We are running on this CPU irq-disabled, so no + * CPU can go offline until we re-enable irqs. + * The current CPU might have already gone + * offline (between the for_each_offline_cpu and + * the spin_lock_irqsave), but in that case all its + * callback lists will be empty, so no harm done. + * + * Advance the callbacks! We share normal RCU's + * donelist, since callbacks are invoked the + * same way in either case. + */ + if (rdp->waitschedlist != NULL) { + *rdp->donetail = rdp->waitschedlist; + rdp->donetail = rdp->waitschedtail; + + /* + * Next rcu_check_callbacks() will + * do the required raise_softirq(). + */ + } + if (rdp->nextschedlist != NULL) { + rdp->waitschedlist = rdp->nextschedlist; + rdp->waitschedtail = rdp->nextschedtail; + couldsleep = 0; + couldsleepnext = 0; + } else { + rdp->waitschedlist = NULL; + rdp->waitschedtail = &rdp->waitschedlist; + } + rdp->nextschedlist = NULL; + rdp->nextschedtail = &rdp->nextschedlist; + + /* Mark sleep intention. */ + + rdp->rcu_sched_sleeping = couldsleep; + + spin_unlock_irqrestore(&rdp->lock, flags); + } + + /* If we saw callbacks on the last scan, go deal with them. */ + + if (!couldsleep) + continue; + + /* Attempt to block... */ + + spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); + if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) { + + /* + * Someone posted a callback after we scanned. + * Go take care of it. + */ + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + couldsleepnext = 0; + continue; + } + + /* Block until the next person posts a callback. */ + + rcu_ctrlblk.sched_sleep = rcu_sched_sleeping; + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + ret = 0; + __wait_event_interruptible(rcu_ctrlblk.sched_wq, + rcu_ctrlblk.sched_sleep != rcu_sched_sleeping, + ret); + + /* + * Signals would prevent us from sleeping, and we cannot + * do much with them in any case. So flush them. + */ + if (ret) + flush_signals(current); + couldsleepnext = 0; + + } while (!kthread_should_stop()); + + return (0); } -EXPORT_SYMBOL_GPL(__synchronize_sched); /* * Check to see if any future RCU-related work will need to be done @@ -1027,7 +1340,9 @@ int rcu_needs_cpu(int cpu) return (rdp->donelist != NULL || !!rdp->waitlistcount || - rdp->nextlist != NULL); + rdp->nextlist != NULL || + rdp->nextschedlist != NULL || + rdp->waitschedlist != NULL); } int rcu_pending(int cpu) @@ -1038,7 +1353,9 @@ int rcu_pending(int cpu) if (rdp->donelist != NULL || !!rdp->waitlistcount || - rdp->nextlist != NULL) + rdp->nextlist != NULL || + rdp->nextschedlist != NULL || + rdp->waitschedlist != NULL) return 1; /* The RCU core needs an acknowledgement from this CPU. 
*/ @@ -1105,6 +1422,11 @@ void __init __rcu_init(void) rdp->donetail = &rdp->donelist; rdp->rcu_flipctr[0] = 0; rdp->rcu_flipctr[1] = 0; + rdp->nextschedlist = NULL; + rdp->nextschedtail = &rdp->nextschedlist; + rdp->waitschedlist = NULL; + rdp->waitschedtail = &rdp->waitschedlist; + rdp->rcu_sched_sleeping = 0; } register_cpu_notifier(&rcu_nb); @@ -1127,11 +1449,15 @@ void __init __rcu_init(void) } /* - * Deprecated, use synchronize_rcu() or synchronize_sched() instead. + * Late-boot-time RCU initialization that must wait until after scheduler + * has been initialized. */ -void synchronize_kernel(void) +void __init rcu_init_sched(void) { - synchronize_rcu(); + rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period, + NULL, + "rcu_sched_grace_period"); + WARN_ON(IS_ERR(rcu_sched_grace_period_task)); } #ifdef CONFIG_RCU_TRACE -- cgit v1.2.3 From 8db559b83009bed92e1b5dd13a651ff273d9ff62 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 May 2008 21:21:05 +0200 Subject: rcu: add memory barriers and comments to rcu_check_callbacks() Add comments to the logic that infers quiescent states when interrupting from either user mode or the idle loop. Also add a memory barrier: it appears that James Huang was in fact onto something, as the scheduler is much less synchronization happy than it once was, so we can no longer rely on its memory barriers in all cases. Signed-off-by: Paul E. McKenney Reported-by: James Huang Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/rcuclassic.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index f4ffbd0f306..d8348792f9f 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -502,10 +502,38 @@ void rcu_check_callbacks(int cpu, int user) if (user || (idle_cpu(cpu) && !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + + /* + * Get here if this CPU took its interrupt from user + * mode or from the idle loop, and if this is not a + * nested interrupt. In this case, the CPU is in + * a quiescent state, so count it. + * + * Also do a memory barrier. This is needed to handle + * the case where writes from a preempt-disable section + * of code get reordered into schedule() by this CPU's + * write buffer. The memory barrier makes sure that + * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see + * by other CPUs to happen after any such write. + */ + + smp_mb(); /* See above block comment. */ rcu_qsctr_inc(cpu); rcu_bh_qsctr_inc(cpu); - } else if (!in_softirq()) + + } else if (!in_softirq()) { + + /* + * Get here if this CPU did not take its interrupt from + * softirq, in other words, if it is not interrupting + * a rcu_bh read-side critical section. This is an _bh + * critical section, so count it. The memory barrier + * is needed for the same reason as is the above one. + */ + + smp_mb(); /* See above block comment. */ rcu_bh_qsctr_inc(cpu); + } raise_rcu_softirq(); } -- cgit v1.2.3 From 70f12f848d3e981479b4f6f751e73c14f7c13e5b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 May 2008 21:21:05 +0200 Subject: rcu: add rcu_barrier_sched() and rcu_barrier_bh() Add rcu_barrier_sched() and rcu_barrier_bh(). With these in place, rcutorture no longer gives the occasional oops when repeatedly starting and stopping torturing rcu_bh. Also adds the API needed to flush out pre-existing call_rcu_sched() callbacks. Signed-off-by: Paul E. 
McKenney Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/rcupdate.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a4e329d9288..4a74b8d48d9 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -45,6 +45,12 @@ #include #include +enum rcu_barrier { + RCU_BARRIER_STD, + RCU_BARRIER_BH, + RCU_BARRIER_SCHED, +}; + static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); @@ -83,19 +89,30 @@ static void rcu_barrier_callback(struct rcu_head *notused) /* * Called with preemption disabled, and from cross-cpu IRQ context. */ -static void rcu_barrier_func(void *notused) +static void rcu_barrier_func(void *type) { int cpu = smp_processor_id(); struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); atomic_inc(&rcu_barrier_cpu_count); - call_rcu(head, rcu_barrier_callback); + switch ((enum rcu_barrier)type) { + case RCU_BARRIER_STD: + call_rcu(head, rcu_barrier_callback); + break; + case RCU_BARRIER_BH: + call_rcu_bh(head, rcu_barrier_callback); + break; + case RCU_BARRIER_SCHED: + call_rcu_sched(head, rcu_barrier_callback); + break; + } } -/** - * rcu_barrier - Wait until all the in-flight RCUs are complete. +/* + * Orchestrate the specified type of RCU barrier, waiting for all + * RCU callbacks of the specified type to complete. */ -void rcu_barrier(void) +static void _rcu_barrier(enum rcu_barrier type) { BUG_ON(in_interrupt()); /* Take cpucontrol mutex to protect against CPU hotplug */ @@ -111,13 +128,39 @@ void rcu_barrier(void) * until all the callbacks are queued. */ rcu_read_lock(); - on_each_cpu(rcu_barrier_func, NULL, 0, 1); + on_each_cpu(rcu_barrier_func, (void *)type, 0, 1); rcu_read_unlock(); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); } + +/** + * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. + */ +void rcu_barrier(void) +{ + _rcu_barrier(RCU_BARRIER_STD); +} EXPORT_SYMBOL_GPL(rcu_barrier); +/** + * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. + */ +void rcu_barrier_bh(void) +{ + _rcu_barrier(RCU_BARRIER_BH); +} +EXPORT_SYMBOL_GPL(rcu_barrier_bh); + +/** + * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. + */ +void rcu_barrier_sched(void) +{ + _rcu_barrier(RCU_BARRIER_SCHED); +} +EXPORT_SYMBOL_GPL(rcu_barrier_sched); + void __init rcu_init(void) { __rcu_init(); -- cgit v1.2.3 From 2326974df29988181b6b69ed6fbf42b17adf916f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 May 2008 21:21:05 +0200 Subject: rcu: add call_rcu_sched() and friends to rcutorture Add entry to rcu_torture_ops allowing the correct barrier function to be used upon exit from rcutorture. Also add torture options for the new call_rcu_sched() API. Signed-off-by: Paul E. 
McKenney Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/rcutorture.c | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 33acc424667..0334b6a8bac 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -192,6 +192,7 @@ struct rcu_torture_ops { int (*completed)(void); void (*deferredfree)(struct rcu_torture *p); void (*sync)(void); + void (*cb_barrier)(void); int (*stats)(char *page); char *name; }; @@ -265,6 +266,7 @@ static struct rcu_torture_ops rcu_ops = { .completed = rcu_torture_completed, .deferredfree = rcu_torture_deferred_free, .sync = synchronize_rcu, + .cb_barrier = rcu_barrier, .stats = NULL, .name = "rcu" }; @@ -304,6 +306,7 @@ static struct rcu_torture_ops rcu_sync_ops = { .completed = rcu_torture_completed, .deferredfree = rcu_sync_torture_deferred_free, .sync = synchronize_rcu, + .cb_barrier = NULL, .stats = NULL, .name = "rcu_sync" }; @@ -364,6 +367,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .completed = rcu_bh_torture_completed, .deferredfree = rcu_bh_torture_deferred_free, .sync = rcu_bh_torture_synchronize, + .cb_barrier = rcu_barrier_bh, .stats = NULL, .name = "rcu_bh" }; @@ -377,6 +381,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { .completed = rcu_bh_torture_completed, .deferredfree = rcu_sync_torture_deferred_free, .sync = rcu_bh_torture_synchronize, + .cb_barrier = NULL, .stats = NULL, .name = "rcu_bh_sync" }; @@ -458,6 +463,7 @@ static struct rcu_torture_ops srcu_ops = { .completed = srcu_torture_completed, .deferredfree = rcu_sync_torture_deferred_free, .sync = srcu_torture_synchronize, + .cb_barrier = NULL, .stats = srcu_torture_stats, .name = "srcu" }; @@ -482,6 +488,11 @@ static int sched_torture_completed(void) return 0; } +static void rcu_sched_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); +} + static void sched_torture_synchronize(void) { synchronize_sched(); @@ -494,12 +505,27 @@ static struct rcu_torture_ops sched_ops = { .readdelay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, .completed = sched_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, + .deferredfree = rcu_sched_torture_deferred_free, .sync = sched_torture_synchronize, + .cb_barrier = rcu_barrier_sched, .stats = NULL, .name = "sched" }; +static struct rcu_torture_ops sched_ops_sync = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = sched_torture_read_lock, + .readdelay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = sched_torture_completed, + .deferredfree = rcu_sync_torture_deferred_free, + .sync = sched_torture_synchronize, + .cb_barrier = NULL, + .stats = NULL, + .name = "sched_sync" +}; + /* * RCU torture writer kthread. Repeatedly substitutes a new structure * for that pointed to by rcu_torture_current, freeing the old structure @@ -848,7 +874,9 @@ rcu_torture_cleanup(void) stats_task = NULL; /* Wait for all RCU callbacks to fire. */ - rcu_barrier(); + + if (cur_ops->cb_barrier != NULL) + cur_ops->cb_barrier(); rcu_torture_stats_print(); /* -After- the stats thread is stopped! 
*/ @@ -868,7 +896,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, - &srcu_ops, &sched_ops, }; + &srcu_ops, &sched_ops, &sched_ops_sync, }; /* Process args and tell the world that the torturer is on the job. */ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { -- cgit v1.2.3 From 82524746c27fa418c250a56dd7606b9d3fc79826 Mon Sep 17 00:00:00 2001 From: Franck Bui-Huu Date: Mon, 12 May 2008 21:21:05 +0200 Subject: rcu: split list.h and move rcu-protected lists into rculist.h Move rcu-protected lists from list.h into a new header file rculist.h. This is done because list are a very used primitive structure all over the kernel and it's currently impossible to include other header files in this list.h without creating some circular dependencies. For example, list.h implements rcu-protected list and uses rcu_dereference() without including rcupdate.h. It actually compiles because users of rcu_dereference() are macros. Others RCU functions could be used too but aren't probably because of this. Therefore this patch creates rculist.h which includes rcupdates without to many changes/troubles. Signed-off-by: Franck Bui-Huu Acked-by: Paul E. McKenney Acked-by: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/pid.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 20d59fa2d49..30bd5d4b2ac 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From d7c0651390b6a03ad53f99faec0ba88109d7191d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 May 2008 21:21:06 +0200 Subject: rcu: fix rcu_try_flip_waitack_needed() to prevent grace-period stall The comment was correct -- need to make the code match the comment. Without this patch, if a CPU goes dynticks idle (and stays there forever) in just the right phase of preemptible-RCU grace-period processing, grace periods stall. The offending sequence of events (courtesy of Promela/spin, at least after I got the liveness criterion coded correctly...) is as follows: o CPU 0 is in dynticks-idle mode. Its dynticks_progress_counter is (say) 10. o CPU 0 takes an interrupt, so rcu_irq_enter() increments CPU 0's dynticks_progress_counter to 11. o CPU 1 is doing RCU grace-period processing in rcu_try_flip_idle(), sees rcu_pending(), so invokes dyntick_save_progress_counter(), which in turn takes a snapshot of CPU 0's dynticks_progress_counter into CPU 0's rcu_dyntick_snapshot -- now set to 11. CPU 1 then updates the RCU grace-period state to rcu_try_flip_waitack(). o CPU 0 returns from its interrupt, so rcu_irq_exit() increments CPU 0's dynticks_progress_counter to 12. o CPU 1 later invokes rcu_try_flip_waitack(), which notices that CPU 0 has not yet responded, and hence in turn invokes rcu_try_flip_waitack_needed(). This function examines the state of CPU 0's dynticks_progress_counter and rcu_dyntick_snapshot variables, which it copies to curr (== 12) and snap (== 11), respectively. Because curr!=snap, the first condition fails. Because curr-snap is only 1 and snap is odd, the second condition fails. rcu_try_flip_waitack_needed() therefore incorrectly concludes that it must wait for CPU 0 to explicitly acknowledge the counter flip. o CPU 0 remains forever in dynticks-idle mode, never taking any more hardware interrupts or any NMIs, and never running any more tasks. 
(Of course, -something- will usually eventually happen, which might be why we haven't seen this one in the wild. Still should be fixed!) Therefore the grace period never ends. Fix is to make the code match the comment, as shown below. With this fix, the above scenario would be satisfied with curr being even, and allow the grace period to proceed. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Josh Triplett Cc: Dipankar Sarma Cc: Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index aaa7976bd85..27d0748d8d3 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -597,7 +597,7 @@ rcu_try_flip_waitack_needed(int cpu) * that this CPU already acknowledged the counter. */ - if ((curr - snap) > 2 || (snap & 0x1) == 0) + if ((curr - snap) > 2 || (curr & 0x1) == 0) return 0; /* We need this CPU to explicitly acknowledge the counter flip. */ -- cgit v1.2.3 From e19a98967f49cb63352b3db4818983ea2cec24ba Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Wed, 14 May 2008 16:23:00 -0700 Subject: rcu: remove duplicated include in kernel/rcupreempt_trace.c Removed duplicated include file in kernel/rcupreempt_trace.c Signed-off-by: Huang Weiyi Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/rcupreempt_trace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c index 49ac4947af2..5edf82c34bb 100644 --- a/kernel/rcupreempt_trace.c +++ b/kernel/rcupreempt_trace.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 247ab1a80595100bd7ea61d174f8b7b7e5f8d4bb Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Wed, 14 May 2008 16:23:00 -0700 Subject: rcu: remove duplicated include in kernel/rcupreempt.c Removed duplicated include file in kernel/rcupreempt.c. Signed-off-by: Huang Weiyi Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 27d0748d8d3..b8e4cdae4e8 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -51,7 +51,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From bd3bff9e20f454b242d979ec2f9a4dca0d5fa06f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:41 +0200 Subject: sched: add latency tracer callbacks to the scheduler add 3 lightweight callbacks to the tracer backend. zero impact if tracing is turned off. 
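[Editorial illustration, not part of this patch: the "zero impact if tracing is turned off" claim relies on the usual kernel pattern of the hooks compiling down to empty static inlines when the tracer is configured out. A minimal sketch of that pattern follows, using the hook names and call signatures visible in the diff below; the guard symbol, parameter names and header placement are assumptions, since the header side is outside this kernel/-only diff.]

struct task_struct;

#ifdef CONFIG_FTRACE	/* assumed guard symbol; the real one is not shown in this diff */
extern void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next);
extern void ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr);
extern void ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr);
#else
/* Tracer compiled out: the calls added to sched.c become empty inlines and vanish. */
static inline void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) { }
static inline void ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) { }
static inline void ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr) { }
#endif
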
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index cfa222a9153..463dcdb36ef 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2467,6 +2467,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) out_activate: #endif /* CONFIG_SMP */ + ftrace_wake_up_task(p, rq->curr); schedstat_inc(p, se.nr_wakeups); if (sync) schedstat_inc(p, se.nr_wakeups_sync); @@ -2611,6 +2612,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } + ftrace_wake_up_new_task(p, rq->curr); check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2783,6 +2785,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); + ftrace_ctx_switch(prev, next); mm = next->mm; oldmm = prev->active_mm; /* -- cgit v1.2.3 From 7c731e0a495e25e79dc1e9e68772a67a55721a65 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:41 +0200 Subject: ftrace: make the task state char-string visible to all The tracer wants to be able to convert the state number into a user visible character. This patch pulls that conversion string out the scheduler into the header. This way if it were to ever change, other parts of the kernel will know. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 463dcdb36ef..73e60085236 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5729,7 +5729,7 @@ out_unlock: return retval; } -static const char stat_nam[] = "RSDTtZX"; +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; void sched_show_task(struct task_struct *p) { -- cgit v1.2.3 From 16444a8a40d4c7b4f6de34af0cae1f76a4f6c901 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: add basic support for gcc profiler instrumentation If CONFIG_FTRACE is selected and /proc/sys/kernel/ftrace_enabled is set to a non-zero value the ftrace routine will be called everytime we enter a kernel function that is not marked with the "notrace" attribute. The ftrace routine will then call a registered function if a function happens to be registered. [ This code has been highly hacked by Steven Rostedt and Ingo Molnar, so don't blame Arnaldo for all of this ;-) ] Update: It is now possible to register more than one ftrace function. If only one ftrace function is registered, that will be the function that ftrace calls directly. If more than one function is registered, then ftrace will call a function that will loop through the functions to call. 
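[Editorial illustration, not part of this patch: a minimal sketch of how a caller would use this interface, based on the struct ftrace_ops and register_ftrace_function() added below. The header path, the my_* names and the initcall choice are assumptions.]

#include <linux/ftrace.h>	/* assumed header for struct ftrace_ops */

static unsigned long my_call_count;	/* illustrative counter; not exact under SMP */

/* The callback, and everything it calls, must be notrace to avoid recursion. */
static void notrace my_trace_func(unsigned long ip, unsigned long parent_ip)
{
	my_call_count++;	/* runs on every traced kernel function entry */
}

static struct ftrace_ops my_trace_ops __read_mostly =
{
	.func = my_trace_func,
};

static int __init my_tracer_init(void)
{
	return register_ftrace_function(&my_trace_ops);
}
device_initcall(my_tracer_init);

[Tearing it down is the mirror image via unregister_ftrace_function(&my_trace_ops); as the code below shows, ftrace then falls back to ftrace_stub or to the one remaining callback.]
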
Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/Makefile | 1 + kernel/trace/Kconfig | 5 ++ kernel/trace/Makefile | 3 ++ kernel/trace/ftrace.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 147 insertions(+) create mode 100644 kernel/trace/Kconfig create mode 100644 kernel/trace/Makefile create mode 100644 kernel/trace/ftrace.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 1c9938addb9..fa05f6d8bdb 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_FTRACE) += trace/ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig new file mode 100644 index 00000000000..8185c91417b --- /dev/null +++ b/kernel/trace/Kconfig @@ -0,0 +1,5 @@ +# +# Architectures that offer an FTRACE implementation should select HAVE_FTRACE: +# +config HAVE_FTRACE + bool diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile new file mode 100644 index 00000000000..bf4fd215a6a --- /dev/null +++ b/kernel/trace/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_FTRACE) += libftrace.o + +libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c new file mode 100644 index 00000000000..b6a80b98a3f --- /dev/null +++ b/kernel/trace/ftrace.c @@ -0,0 +1,138 @@ +/* + * Infrastructure for profiling code inserted by 'gcc -pg'. + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2004-2008 Ingo Molnar + * + * Originally ported from the -rt patch by: + * Copyright (C) 2007 Arnaldo Carvalho de Melo + * + * Based on code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ + +#include +#include + +static DEFINE_SPINLOCK(ftrace_func_lock); +static struct ftrace_ops ftrace_list_end __read_mostly = +{ + .func = ftrace_stub, +}; + +static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; +ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; + +/* mcount is defined per arch in assembly */ +EXPORT_SYMBOL(mcount); + +notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_ops *op = ftrace_list; + + /* in case someone actually ports this to alpha! */ + read_barrier_depends(); + + while (op != &ftrace_list_end) { + /* silly alpha */ + read_barrier_depends(); + op->func(ip, parent_ip); + op = op->next; + }; +} + +/** + * register_ftrace_function - register a function for profiling + * @ops - ops structure that holds the function for profiling. + * + * Register a function to be called by all functions in the + * kernel. + * + * Note: @ops->func and all the functions it calls must be labeled + * with "notrace", otherwise it will go into a + * recursive loop. + */ +int register_ftrace_function(struct ftrace_ops *ops) +{ + unsigned long flags; + + spin_lock_irqsave(&ftrace_func_lock, flags); + ops->next = ftrace_list; + /* + * We are entering ops into the ftrace_list but another + * CPU might be walking that list. We need to make sure + * the ops->next pointer is valid before another CPU sees + * the ops pointer included into the ftrace_list. + */ + smp_wmb(); + ftrace_list = ops; + /* + * For one func, simply call it directly. 
+ * For more than one func, call the chain. + */ + if (ops->next == &ftrace_list_end) + ftrace_trace_function = ops->func; + else + ftrace_trace_function = ftrace_list_func; + spin_unlock_irqrestore(&ftrace_func_lock, flags); + + return 0; +} + +/** + * unregister_ftrace_function - unresgister a function for profiling. + * @ops - ops structure that holds the function to unregister + * + * Unregister a function that was added to be called by ftrace profiling. + */ +int unregister_ftrace_function(struct ftrace_ops *ops) +{ + unsigned long flags; + struct ftrace_ops **p; + int ret = 0; + + spin_lock_irqsave(&ftrace_func_lock, flags); + + /* + * If we are the only function, then the ftrace pointer is + * pointing directly to that function. + */ + if (ftrace_list == ops && ops->next == &ftrace_list_end) { + ftrace_trace_function = ftrace_stub; + ftrace_list = &ftrace_list_end; + goto out; + } + + for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) + if (*p == ops) + break; + + if (*p != ops) { + ret = -1; + goto out; + } + + *p = (*p)->next; + + /* If we only have one func left, then call that directly */ + if (ftrace_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_list->func; + + out: + spin_unlock_irqrestore(&ftrace_func_lock, flags); + + return 0; +} + +/** + * clear_ftrace_function - reset the ftrace function + * + * This NULLs the ftrace function and in essence stops + * tracing. There may be lag + */ +void clear_ftrace_function(void) +{ + ftrace_trace_function = ftrace_stub; +} -- cgit v1.2.3 From bc0c38d139ec7fcd5c030aea16b008f3732e42ac Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: latency tracer infrastructure This patch adds the latency tracer infrastructure. This patch does not add anything that will select and turn it on, but will be used by later patches. If it were to be compiled, it would add the following files to the debugfs: The root tracing directory: /debugfs/tracing/ This patch also adds the following files: available_tracers list of available tracers. Currently no tracers are available. Looking into this file only shows "none" which is used to unregister all tracers. current_tracer The trace that is currently active. Empty on start up. To switch to a tracer simply echo one of the tracers that are listed in available_tracers: example: (used with later patches) echo function > /debugfs/tracing/current_tracer To disable the tracer: echo disable > /debugfs/tracing/current_tracer tracing_enabled echoing "1" into this file starts the ftrace function tracing (if sysctl kernel.ftrace_enabled=1) echoing "0" turns it off. latency_trace This file is readonly and holds the result of the trace. trace This file outputs a easier to read version of the trace. iter_ctrl Controls the way the output of traces look. So far there's two controls: echoing in "symonly" will only show the kallsyms variables without the addresses (if kallsyms was configured) echoing in "verbose" will change the output to show a lot more data, but not very easy to understand by humans. echoing in "nosymonly" turns off symonly. echoing in "noverbose" turns off verbose. 
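[Editorial illustration, not part of this patch: a sketch of how a tracer plugin would end up listed in available_tracers. Only register_tracer()'s use of the .name field is visible in the trace.c below, so the "nop_example" name, the initcall choice and the omission of all other struct tracer fields are assumptions.]

static struct tracer nop_example_tracer __read_mostly =
{
	.name = "nop_example",	/* what shows up in available_tracers */
};

static int __init init_nop_example_tracer(void)
{
	return register_tracer(&nop_example_tracer);
}
device_initcall(init_nop_example_tracer);

[Once registered, it could be selected the same way as in the examples above: echo nop_example > /debugfs/tracing/current_tracer]
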
Signed-off-by: Steven Rostedt Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/Makefile | 1 + kernel/trace/Kconfig | 5 + kernel/trace/Makefile | 2 + kernel/trace/trace.c | 1547 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 184 ++++++ 5 files changed, 1739 insertions(+) create mode 100644 kernel/trace/trace.c create mode 100644 kernel/trace/trace.h (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index fa05f6d8bdb..7e344e7b0cb 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -70,6 +70,7 @@ obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o obj-$(CONFIG_FTRACE) += trace/ +obj-$(CONFIG_TRACING) += trace/ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8185c91417b..ce70677afbf 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -3,3 +3,8 @@ # config HAVE_FTRACE bool + +config TRACING + bool + select DEBUG_FS + diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index bf4fd215a6a..7af40317525 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,3 +1,5 @@ obj-$(CONFIG_FTRACE) += libftrace.o +obj-$(CONFIG_TRACING) += trace.o + libftrace-y := ftrace.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c new file mode 100644 index 00000000000..1b8eca7650d --- /dev/null +++ b/kernel/trace/trace.c @@ -0,0 +1,1547 @@ +/* + * ring buffer based function tracer + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * Originally taken from the RT patch by: + * Arnaldo Carvalho de Melo + * + * Based on code from the latency_tracer, that is: + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; +unsigned long __read_mostly tracing_thresh; + +static long notrace +ns2usecs(cycle_t nsec) +{ + nsec += 500; + do_div(nsec, 1000); + return nsec; +} + +static atomic_t tracer_counter; +static struct trace_array global_trace; + +static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); + +static struct trace_array max_tr; + +static DEFINE_PER_CPU(struct trace_array_cpu, max_data); + +static int tracer_enabled; +static unsigned long trace_nr_entries = 4096UL; + +static struct tracer *trace_types __read_mostly; +static struct tracer *current_trace __read_mostly; +static int max_tracer_type_len; + +static DEFINE_MUTEX(trace_types_lock); + +static int __init set_nr_entries(char *str) +{ + if (!str) + return 0; + trace_nr_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("trace_entries=", set_nr_entries); + +enum trace_type { + __TRACE_FIRST_TYPE = 0, + + TRACE_FN, + TRACE_CTX, + + __TRACE_LAST_TYPE +}; + +enum trace_flag_type { + TRACE_FLAG_IRQS_OFF = 0x01, + TRACE_FLAG_NEED_RESCHED = 0x02, + TRACE_FLAG_HARDIRQ = 0x04, + TRACE_FLAG_SOFTIRQ = 0x08, +}; + +enum trace_iterator_flags { + TRACE_ITER_PRINT_PARENT = 0x01, + TRACE_ITER_SYM_OFFSET = 0x02, + TRACE_ITER_SYM_ADDR = 0x04, + TRACE_ITER_VERBOSE = 0x08, +}; + +#define TRACE_ITER_SYM_MASK \ + (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) + +/* These must match the bit postions above */ 
+static const char *trace_options[] = { + "print-parent", + "sym-offset", + "sym-addr", + "verbose", + NULL +}; + +static unsigned trace_flags; + + +/* + * Copy the new maximum trace into the separate maximum-trace + * structure. (this way the maximum trace is permanently saved, + * for later retrieval via /debugfs/tracing/latency_trace) + */ +static void notrace +__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data = tr->data[cpu]; + + max_tr.cpu = cpu; + max_tr.time_start = data->preempt_timestamp; + + data = max_tr.data[cpu]; + data->saved_latency = tracing_max_latency; + + memcpy(data->comm, tsk->comm, TASK_COMM_LEN); + data->pid = tsk->pid; + data->uid = tsk->uid; + data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; + data->policy = tsk->policy; + data->rt_priority = tsk->rt_priority; + + /* record this tasks comm */ + tracing_record_cmdline(current); +} + +notrace void +update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data; + void *save_trace; + int i; + + /* clear out all the previous traces */ + for_each_possible_cpu(i) { + data = tr->data[i]; + save_trace = max_tr.data[i]->trace; + memcpy(max_tr.data[i], data, sizeof(*data)); + data->trace = save_trace; + } + + __update_max_tr(tr, tsk, cpu); +} + +/** + * update_max_tr_single - only copy one trace over, and reset the rest + * @tr - tracer + * @tsk - task with the latency + * @cpu - the cpu of the buffer to copy. + */ +notrace void +update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data = tr->data[cpu]; + void *save_trace; + int i; + + for_each_possible_cpu(i) + tracing_reset(max_tr.data[i]); + + save_trace = max_tr.data[cpu]->trace; + memcpy(max_tr.data[cpu], data, sizeof(*data)); + data->trace = save_trace; + + __update_max_tr(tr, tsk, cpu); +} + +int register_tracer(struct tracer *type) +{ + struct tracer *t; + int len; + int ret = 0; + + if (!type->name) { + pr_info("Tracer must have a name\n"); + return -1; + } + + mutex_lock(&trace_types_lock); + for (t = trace_types; t; t = t->next) { + if (strcmp(type->name, t->name) == 0) { + /* already found */ + pr_info("Trace %s already registered\n", + type->name); + ret = -1; + goto out; + } + } + + type->next = trace_types; + trace_types = type; + len = strlen(type->name); + if (len > max_tracer_type_len) + max_tracer_type_len = len; + out: + mutex_unlock(&trace_types_lock); + + return ret; +} + +void unregister_tracer(struct tracer *type) +{ + struct tracer **t; + int len; + + mutex_lock(&trace_types_lock); + for (t = &trace_types; *t; t = &(*t)->next) { + if (*t == type) + goto found; + } + pr_info("Trace %s not registered\n", type->name); + goto out; + + found: + *t = (*t)->next; + if (strlen(type->name) != max_tracer_type_len) + goto out; + + max_tracer_type_len = 0; + for (t = &trace_types; *t; t = &(*t)->next) { + len = strlen((*t)->name); + if (len > max_tracer_type_len) + max_tracer_type_len = len; + } + out: + mutex_unlock(&trace_types_lock); +} + +void notrace tracing_reset(struct trace_array_cpu *data) +{ + data->trace_idx = 0; + atomic_set(&data->underrun, 0); +} + +#ifdef CONFIG_FTRACE +static void notrace +function_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (unlikely(!tracer_enabled)) + return; + + raw_local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = 
tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + ftrace(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); + raw_local_irq_restore(flags); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = function_trace_call, +}; +#endif + +notrace void tracing_start_function_trace(void) +{ + register_ftrace_function(&trace_ops); +} + +notrace void tracing_stop_function_trace(void) +{ + unregister_ftrace_function(&trace_ops); +} + +#define SAVED_CMDLINES 128 +static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; +static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; +static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; +static int cmdline_idx; +static DEFINE_SPINLOCK(trace_cmdline_lock); +atomic_t trace_record_cmdline_disabled; + +static void trace_init_cmdlines(void) +{ + memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline)); + memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid)); + cmdline_idx = 0; +} + +notrace void trace_stop_cmdline_recording(void); + +static void notrace trace_save_cmdline(struct task_struct *tsk) +{ + unsigned map; + unsigned idx; + + if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) + return; + + /* + * It's not the end of the world if we don't get + * the lock, but we also don't want to spin + * nor do we want to disable interrupts, + * so if we miss here, then better luck next time. + */ + if (!spin_trylock(&trace_cmdline_lock)) + return; + + idx = map_pid_to_cmdline[tsk->pid]; + if (idx >= SAVED_CMDLINES) { + idx = (cmdline_idx + 1) % SAVED_CMDLINES; + + map = map_cmdline_to_pid[idx]; + if (map <= PID_MAX_DEFAULT) + map_pid_to_cmdline[map] = (unsigned)-1; + + map_pid_to_cmdline[tsk->pid] = idx; + + cmdline_idx = idx; + } + + memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); + + spin_unlock(&trace_cmdline_lock); +} + +static notrace char *trace_find_cmdline(int pid) +{ + char *cmdline = "<...>"; + unsigned map; + + if (!pid) + return ""; + + if (pid > PID_MAX_DEFAULT) + goto out; + + map = map_pid_to_cmdline[pid]; + if (map >= SAVED_CMDLINES) + goto out; + + cmdline = saved_cmdlines[map]; + + out: + return cmdline; +} + +notrace void tracing_record_cmdline(struct task_struct *tsk) +{ + if (atomic_read(&trace_record_cmdline_disabled)) + return; + + trace_save_cmdline(tsk); +} + +static inline notrace struct trace_entry * +tracing_get_trace_entry(struct trace_array *tr, + struct trace_array_cpu *data) +{ + unsigned long idx, idx_next; + struct trace_entry *entry; + + idx = data->trace_idx; + idx_next = idx + 1; + + if (unlikely(idx_next >= tr->entries)) { + atomic_inc(&data->underrun); + idx_next = 0; + } + + data->trace_idx = idx_next; + + if (unlikely(idx_next != 0 && atomic_read(&data->underrun))) + atomic_inc(&data->underrun); + + entry = data->trace + idx * TRACE_ENTRY_SIZE; + + return entry; +} + +static inline notrace void +tracing_generic_entry_update(struct trace_entry *entry, + unsigned long flags) +{ + struct task_struct *tsk = current; + unsigned long pc; + + pc = preempt_count(); + + entry->idx = atomic_inc_return(&tracer_counter); + entry->preempt_count = pc & 0xff; + entry->pid = tsk->pid; + entry->t = now(raw_smp_processor_id()); + entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | + ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | + ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | + (need_resched() ? 
TRACE_FLAG_NEED_RESCHED : 0); +} + +notrace void +ftrace(struct trace_array *tr, struct trace_array_cpu *data, + unsigned long ip, unsigned long parent_ip, + unsigned long flags) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_FN; + entry->fn.ip = ip; + entry->fn.parent_ip = parent_ip; +} + +notrace void +tracing_sched_switch_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *prev, struct task_struct *next, + unsigned long flags) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_CTX; + entry->ctx.prev_pid = prev->pid; + entry->ctx.prev_prio = prev->prio; + entry->ctx.prev_state = prev->state; + entry->ctx.next_pid = next->pid; + entry->ctx.next_prio = next->prio; +} + +enum trace_file_type { + TRACE_FILE_LAT_FMT = 1, +}; + +static struct trace_entry * +trace_entry_idx(struct trace_array *tr, unsigned long idx, int cpu) +{ + struct trace_entry *array = tr->data[cpu]->trace; + unsigned long underrun; + + if (idx >= tr->entries) + return NULL; + + underrun = atomic_read(&tr->data[cpu]->underrun); + if (underrun) + idx = ((underrun - 1) + idx) % tr->entries; + else if (idx >= tr->data[cpu]->trace_idx) + return NULL; + + return &array[idx]; +} + +static struct notrace trace_entry * +find_next_entry(struct trace_iterator *iter, int *ent_cpu) +{ + struct trace_array *tr = iter->tr; + struct trace_entry *ent, *next = NULL; + int next_cpu = -1; + int cpu; + + for_each_possible_cpu(cpu) { + if (!tr->data[cpu]->trace) + continue; + ent = trace_entry_idx(tr, iter->next_idx[cpu], cpu); + if (ent && + (!next || (long)(next->idx - ent->idx) > 0)) { + next = ent; + next_cpu = cpu; + } + } + + if (ent_cpu) + *ent_cpu = next_cpu; + + return next; +} + +static void *find_next_entry_inc(struct trace_iterator *iter) +{ + struct trace_entry *next; + int next_cpu = -1; + + next = find_next_entry(iter, &next_cpu); + + if (next) { + iter->next_idx[next_cpu]++; + iter->idx++; + } + iter->ent = next; + iter->cpu = next_cpu; + + return next ? 
iter : NULL; +} + +static void notrace * +s_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_iterator *iter = m->private; + void *ent; + void *last_ent = iter->ent; + int i = (int)*pos; + + (*pos)++; + + /* can't go backwards */ + if (iter->idx > i) + return NULL; + + if (iter->idx < 0) + ent = find_next_entry_inc(iter); + else + ent = iter; + + while (ent && iter->idx < i) + ent = find_next_entry_inc(iter); + + iter->pos = *pos; + + if (last_ent && !ent) + seq_puts(m, "\n\nvim:ft=help\n"); + + return ent; +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + struct trace_iterator *iter = m->private; + void *p = NULL; + loff_t l = 0; + int i; + + mutex_lock(&trace_types_lock); + + if (!current_trace || current_trace != iter->trace) + return NULL; + + atomic_inc(&trace_record_cmdline_disabled); + + /* let the tracer grab locks here if needed */ + if (current_trace->start) + current_trace->start(iter); + + if (*pos != iter->pos) { + iter->ent = NULL; + iter->cpu = 0; + iter->idx = -1; + + for (i = 0; i < NR_CPUS; i++) + iter->next_idx[i] = 0; + + for (p = iter; p && l < *pos; p = s_next(m, p, &l)) + ; + + } else { + l = *pos; + p = s_next(m, p, &l); + } + + return p; +} + +static void s_stop(struct seq_file *m, void *p) +{ + struct trace_iterator *iter = m->private; + + atomic_dec(&trace_record_cmdline_disabled); + + /* let the tracer release locks here if needed */ + if (current_trace && current_trace == iter->trace && iter->trace->stop) + iter->trace->stop(iter); + + mutex_unlock(&trace_types_lock); +} + +static void +seq_print_sym_short(struct seq_file *m, const char *fmt, unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + + kallsyms_lookup(address, NULL, NULL, NULL, str); + + seq_printf(m, fmt, str); +#endif +} + +static void +seq_print_sym_offset(struct seq_file *m, const char *fmt, unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + + sprint_symbol(str, address); + seq_printf(m, fmt, str); +#endif +} + +#ifndef CONFIG_64BIT +# define IP_FMT "%08lx" +#else +# define IP_FMT "%016lx" +#endif + +static void notrace +seq_print_ip_sym(struct seq_file *m, unsigned long ip, unsigned long sym_flags) +{ + if (!ip) { + seq_printf(m, "0"); + return; + } + + if (sym_flags & TRACE_ITER_SYM_OFFSET) + seq_print_sym_offset(m, "%s", ip); + else + seq_print_sym_short(m, "%s", ip); + + if (sym_flags & TRACE_ITER_SYM_ADDR) + seq_printf(m, " <" IP_FMT ">", ip); +} + +static void notrace print_lat_help_header(struct seq_file *m) +{ + seq_puts(m, "# _------=> CPU# \n"); + seq_puts(m, "# / _-----=> irqs-off \n"); + seq_puts(m, "# | / _----=> need-resched \n"); + seq_puts(m, "# || / _---=> hardirq/softirq \n"); + seq_puts(m, "# ||| / _--=> preempt-depth \n"); + seq_puts(m, "# |||| / \n"); + seq_puts(m, "# ||||| delay \n"); + seq_puts(m, "# cmd pid ||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); +} + +static void notrace print_func_help_header(struct seq_file *m) +{ + seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | | |\n"); +} + + +static void notrace +print_trace_header(struct seq_file *m, struct trace_iterator *iter) +{ + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_array *tr = iter->tr; + struct trace_array_cpu *data = tr->data[tr->cpu]; + struct tracer *type = current_trace; + unsigned long underruns = 0; + unsigned long underrun; + unsigned long entries = 0; + int cpu; + const char *name = "preemption"; + + if (type) + name = type->name; + + 
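+	/*
+	 * Add up the per-CPU entry counts: a non-zero underrun count means
+	 * that CPU's buffer has wrapped and holds a full tr->entries.
+	 */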
for_each_possible_cpu(cpu) { + if (tr->data[cpu]->trace) { + underrun = atomic_read(&tr->data[cpu]->underrun); + if (underrun) { + underruns += underrun; + entries += tr->entries; + } else + entries += tr->data[cpu]->trace_idx; + } + } + + seq_printf(m, "%s latency trace v1.1.5 on %s\n", + name, UTS_RELEASE); + seq_puts(m, "-----------------------------------" + "---------------------------------\n"); + seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |" + " (M:%s VP:%d, KP:%d, SP:%d HP:%d", + data->saved_latency, + entries, + (entries + underruns), + tr->cpu, +#if defined(CONFIG_PREEMPT_NONE) + "server", +#elif defined(CONFIG_PREEMPT_VOLUNTARY) + "desktop", +#elif defined(CONFIG_PREEMPT_DESKTOP) + "preempt", +#else + "unknown", +#endif + /* These are reserved for later use */ + 0, 0, 0, 0); +#ifdef CONFIG_SMP + seq_printf(m, " #P:%d)\n", num_online_cpus()); +#else + seq_puts(m, ")\n"); +#endif + seq_puts(m, " -----------------\n"); + seq_printf(m, " | task: %.16s-%d " + "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", + data->comm, data->pid, data->uid, data->nice, + data->policy, data->rt_priority); + seq_puts(m, " -----------------\n"); + + if (data->critical_start) { + seq_puts(m, " => started at: "); + seq_print_ip_sym(m, data->critical_start, sym_flags); + seq_puts(m, "\n => ended at: "); + seq_print_ip_sym(m, data->critical_end, sym_flags); + seq_puts(m, "\n"); + } + + seq_puts(m, "\n"); +} + +unsigned long nsecs_to_usecs(unsigned long nsecs) +{ + return nsecs / 1000; +} + +static void notrace +lat_print_generic(struct seq_file *m, struct trace_entry *entry, int cpu) +{ + int hardirq, softirq; + char *comm; + + comm = trace_find_cmdline(entry->pid); + + seq_printf(m, "%8.8s-%-5d ", comm, entry->pid); + seq_printf(m, "%d", cpu); + seq_printf(m, "%c%c", + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 
'N' : '.')); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) + seq_putc(m, 'H'); + else { + if (hardirq) + seq_putc(m, 'h'); + else { + if (softirq) + seq_putc(m, 's'); + else + seq_putc(m, '.'); + } + } + + if (entry->preempt_count) + seq_printf(m, "%x", entry->preempt_count); + else + seq_puts(m, "."); +} + +unsigned long preempt_mark_thresh = 100; + +static void notrace +lat_print_timestamp(struct seq_file *m, unsigned long long abs_usecs, + unsigned long rel_usecs) +{ + seq_printf(m, " %4lldus", abs_usecs); + if (rel_usecs > preempt_mark_thresh) + seq_puts(m, "!: "); + else if (rel_usecs > 1) + seq_puts(m, "+: "); + else + seq_puts(m, " : "); +} + +static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; + +static void notrace +print_lat_fmt(struct seq_file *m, struct trace_iterator *iter, + unsigned int trace_idx, int cpu) +{ + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_entry *next_entry = find_next_entry(iter, NULL); + unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); + struct trace_entry *entry = iter->ent; + unsigned long abs_usecs; + unsigned long rel_usecs; + char *comm; + int S; + + if (!next_entry) + next_entry = entry; + rel_usecs = ns2usecs(next_entry->t - entry->t); + abs_usecs = ns2usecs(entry->t - iter->tr->time_start); + + if (verbose) { + comm = trace_find_cmdline(entry->pid); + seq_printf(m, "%16s %5d %d %d %08x %08x [%08lx]" + " %ld.%03ldms (+%ld.%03ldms): ", + comm, + entry->pid, cpu, entry->flags, + entry->preempt_count, trace_idx, + ns2usecs(entry->t), + abs_usecs/1000, + abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); + } else { + lat_print_generic(m, entry, cpu); + lat_print_timestamp(m, abs_usecs, rel_usecs); + } + switch (entry->type) { + case TRACE_FN: + seq_print_ip_sym(m, entry->fn.ip, sym_flags); + seq_puts(m, " ("); + seq_print_ip_sym(m, entry->fn.parent_ip, sym_flags); + seq_puts(m, ")\n"); + break; + case TRACE_CTX: + S = entry->ctx.prev_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.prev_state] : 'X'; + comm = trace_find_cmdline(entry->ctx.next_pid); + seq_printf(m, " %d:%d:%c --> %d:%d %s\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, + entry->ctx.next_pid, + entry->ctx.next_prio, + comm); + break; + } +} + +static void notrace +print_trace_fmt(struct seq_file *m, struct trace_iterator *iter) +{ + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_entry *entry = iter->ent; + unsigned long usec_rem; + unsigned long long t; + unsigned long secs; + char *comm; + int S; + + comm = trace_find_cmdline(iter->ent->pid); + + t = ns2usecs(entry->t); + usec_rem = do_div(t, 1000000ULL); + secs = (unsigned long)t; + + seq_printf(m, "%16s-%-5d ", comm, entry->pid); + seq_printf(m, "[%02d] ", iter->cpu); + seq_printf(m, "%5lu.%06lu: ", secs, usec_rem); + + switch (entry->type) { + case TRACE_FN: + seq_print_ip_sym(m, entry->fn.ip, sym_flags); + if ((sym_flags & TRACE_ITER_PRINT_PARENT) && + entry->fn.parent_ip) { + seq_printf(m, " <-"); + seq_print_ip_sym(m, entry->fn.parent_ip, sym_flags); + } + break; + case TRACE_CTX: + S = entry->ctx.prev_state < sizeof(state_to_char) ? 
+ state_to_char[entry->ctx.prev_state] : 'X'; + seq_printf(m, " %d:%d:%c ==> %d:%d\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, + entry->ctx.next_pid, + entry->ctx.next_prio); + break; + } + seq_printf(m, "\n"); +} + +static int trace_empty(struct trace_iterator *iter) +{ + struct trace_array_cpu *data; + int cpu; + + for_each_possible_cpu(cpu) { + data = iter->tr->data[cpu]; + + if (data->trace && + (data->trace_idx || + atomic_read(&data->underrun))) + return 0; + } + return 1; +} + +static int s_show(struct seq_file *m, void *v) +{ + struct trace_iterator *iter = v; + + if (iter->ent == NULL) { + if (iter->tr) { + seq_printf(m, "# tracer: %s\n", iter->trace->name); + seq_puts(m, "#\n"); + } + if (iter->iter_flags & TRACE_FILE_LAT_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return 0; + print_trace_header(m, iter); + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_lat_help_header(m); + } else { + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_func_help_header(m); + } + } else { + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + print_lat_fmt(m, iter, iter->idx, iter->cpu); + else + print_trace_fmt(m, iter); + } + + return 0; +} + +static struct seq_operations tracer_seq_ops = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static struct trace_iterator notrace * +__tracing_open(struct inode *inode, struct file *file, int *ret) +{ + struct trace_iterator *iter; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) { + *ret = -ENOMEM; + goto out; + } + + mutex_lock(&trace_types_lock); + if (current_trace && current_trace->print_max) + iter->tr = &max_tr; + else + iter->tr = inode->i_private; + iter->trace = current_trace; + iter->pos = -1; + + /* TODO stop tracer */ + *ret = seq_open(file, &tracer_seq_ops); + if (!*ret) { + struct seq_file *m = file->private_data; + m->private = iter; + + /* stop the trace while dumping */ + if (iter->tr->ctrl) + tracer_enabled = 0; + + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + } else { + kfree(iter); + iter = NULL; + } + mutex_unlock(&trace_types_lock); + + out: + return iter; +} + +int tracing_open_generic(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->i_private; + return 0; +} + +int tracing_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct trace_iterator *iter = m->private; + + mutex_lock(&trace_types_lock); + if (iter->trace && iter->trace->close) + iter->trace->close(iter); + + /* reenable tracing if it was previously enabled */ + if (iter->tr->ctrl) + tracer_enabled = 1; + mutex_unlock(&trace_types_lock); + + seq_release(inode, file); + kfree(iter); + return 0; +} + +static int tracing_open(struct inode *inode, struct file *file) +{ + int ret; + + __tracing_open(inode, file, &ret); + + return ret; +} + +static int tracing_lt_open(struct inode *inode, struct file *file) +{ + struct trace_iterator *iter; + int ret; + + iter = __tracing_open(inode, file, &ret); + + if (!ret) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + + return ret; +} + + +static void notrace * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct tracer *t = m->private; + + (*pos)++; + + if (t) + t = t->next; + + m->private = t; + + return t; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct tracer *t = m->private; + loff_t l = 0; + + mutex_lock(&trace_types_lock); + for (; t && l < *pos; t = t_next(m, t, &l)) + ; + + return t; +} + +static void 
t_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&trace_types_lock); +} + +static int t_show(struct seq_file *m, void *v) +{ + struct tracer *t = v; + + if (!t) + return 0; + + seq_printf(m, "%s", t->name); + if (t->next) + seq_putc(m, ' '); + else + seq_putc(m, '\n'); + + return 0; +} + +static struct seq_operations show_traces_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int show_traces_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &show_traces_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = trace_types; + } + + return ret; +} + +static struct file_operations tracing_fops = { + .open = tracing_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_release, +}; + +static struct file_operations tracing_lt_fops = { + .open = tracing_lt_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_release, +}; + +static struct file_operations show_traces_fops = { + .open = show_traces_open, + .read = seq_read, + .release = seq_release, +}; + +static ssize_t +tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *buf; + int r = 0; + int len = 0; + int i; + + /* calulate max size */ + for (i = 0; trace_options[i]; i++) { + len += strlen(trace_options[i]); + len += 3; /* "no" and space */ + } + + /* +2 for \n and \0 */ + buf = kmalloc(len + 2, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + for (i = 0; trace_options[i]; i++) { + if (trace_flags & (1 << i)) + r += sprintf(buf + r, "%s ", trace_options[i]); + else + r += sprintf(buf + r, "no%s ", trace_options[i]); + } + + r += sprintf(buf + r, "\n"); + WARN_ON(r >= len + 2); + + r = simple_read_from_buffer(ubuf, cnt, ppos, + buf, r); + + kfree(buf); + + return r; +} + +static ssize_t +tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp = buf; + int neg = 0; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strncmp(buf, "no", 2) == 0) { + neg = 1; + cmp += 2; + } + + for (i = 0; trace_options[i]; i++) { + int len = strlen(trace_options[i]); + + if (strncmp(cmp, trace_options[i], len) == 0) { + if (neg) + trace_flags &= ~(1 << i); + else + trace_flags |= (1 << i); + break; + } + } + + filp->f_pos += cnt; + + return cnt; +} + +static struct file_operations tracing_iter_fops = { + .open = tracing_open_generic, + .read = tracing_iter_ctrl_read, + .write = tracing_iter_ctrl_write, +}; + +static ssize_t +tracing_ctrl_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r; + + r = sprintf(buf, "%ld\n", tr->ctrl); + return simple_read_from_buffer(ubuf, cnt, ppos, + buf, r); +} + +static ssize_t +tracing_ctrl_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + long val; + char buf[64]; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + val = simple_strtoul(buf, NULL, 10); + + val = !!val; + + mutex_lock(&trace_types_lock); + if (tr->ctrl ^ val) { + if (val) + tracer_enabled = 1; + else + tracer_enabled = 0; + + tr->ctrl = val; + + if (current_trace && current_trace->ctrl_update) + current_trace->ctrl_update(tr); + } + mutex_unlock(&trace_types_lock); + + filp->f_pos += cnt; + + return cnt; 
+} + +static ssize_t +tracing_set_trace_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[max_tracer_type_len+2]; + int r; + + mutex_lock(&trace_types_lock); + if (current_trace) + r = sprintf(buf, "%s\n", current_trace->name); + else + r = sprintf(buf, "\n"); + mutex_unlock(&trace_types_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, + buf, r); +} + +static ssize_t +tracing_set_trace_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = &global_trace; + struct tracer *t; + char buf[max_tracer_type_len+1]; + int i; + + if (cnt > max_tracer_type_len) + cnt = max_tracer_type_len; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + /* strip ending whitespace. */ + for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) + buf[i] = 0; + + mutex_lock(&trace_types_lock); + for (t = trace_types; t; t = t->next) { + if (strcmp(t->name, buf) == 0) + break; + } + if (!t || t == current_trace) + goto out; + + if (current_trace && current_trace->reset) + current_trace->reset(tr); + + current_trace = t; + if (t->init) + t->init(tr); + + out: + mutex_unlock(&trace_types_lock); + + filp->f_pos += cnt; + + return cnt; +} + +static ssize_t +tracing_max_lat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long *ptr = filp->private_data; + char buf[64]; + int r; + + r = snprintf(buf, 64, "%ld\n", + *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr)); + if (r > 64) + r = 64; + return simple_read_from_buffer(ubuf, cnt, ppos, + buf, r); +} + +static ssize_t +tracing_max_lat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + long *ptr = filp->private_data; + long val; + char buf[64]; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + val = simple_strtoul(buf, NULL, 10); + + *ptr = val * 1000; + + return cnt; +} + +static struct file_operations tracing_max_lat_fops = { + .open = tracing_open_generic, + .read = tracing_max_lat_read, + .write = tracing_max_lat_write, +}; + +static struct file_operations tracing_ctrl_fops = { + .open = tracing_open_generic, + .read = tracing_ctrl_read, + .write = tracing_ctrl_write, +}; + +static struct file_operations set_tracer_fops = { + .open = tracing_open_generic, + .read = tracing_set_trace_read, + .write = tracing_set_trace_write, +}; + +#ifdef CONFIG_DYNAMIC_FTRACE + +static ssize_t +tracing_read_long(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long *p = filp->private_data; + char buf[64]; + int r; + + r = sprintf(buf, "%ld\n", *p); + return simple_read_from_buffer(ubuf, cnt, ppos, + buf, r); +} + +static struct file_operations tracing_read_long_fops = { + .open = tracing_open_generic, + .read = tracing_read_long, +}; +#endif + +static struct dentry *d_tracer; + +struct dentry *tracing_init_dentry(void) +{ + static int once; + + if (d_tracer) + return d_tracer; + + d_tracer = debugfs_create_dir("tracing", NULL); + + if (!d_tracer && !once) { + once = 1; + pr_warning("Could not create debugfs directory 'tracing'\n"); + return NULL; + } + + return d_tracer; +} + +static __init void tracer_init_debugfs(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("tracing_enabled", 0644, d_tracer, + &global_trace, &tracing_ctrl_fops); + if (!entry) + pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); + + entry = 
debugfs_create_file("iter_ctrl", 0644, d_tracer, + NULL, &tracing_iter_fops); + if (!entry) + pr_warning("Could not create debugfs 'iter_ctrl' entry\n"); + + entry = debugfs_create_file("latency_trace", 0444, d_tracer, + &global_trace, &tracing_lt_fops); + if (!entry) + pr_warning("Could not create debugfs 'latency_trace' entry\n"); + + entry = debugfs_create_file("trace", 0444, d_tracer, + &global_trace, &tracing_fops); + if (!entry) + pr_warning("Could not create debugfs 'trace' entry\n"); + + entry = debugfs_create_file("available_tracers", 0444, d_tracer, + &global_trace, &show_traces_fops); + if (!entry) + pr_warning("Could not create debugfs 'trace' entry\n"); + + entry = debugfs_create_file("current_tracer", 0444, d_tracer, + &global_trace, &set_tracer_fops); + if (!entry) + pr_warning("Could not create debugfs 'trace' entry\n"); + + entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer, + &tracing_max_latency, + &tracing_max_lat_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'tracing_max_latency' entry\n"); + + entry = debugfs_create_file("tracing_thresh", 0644, d_tracer, + &tracing_thresh, &tracing_max_lat_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'tracing_threash' entry\n"); + +#ifdef CONFIG_DYNAMIC_FTRACE + entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, + &ftrace_update_tot_cnt, + &tracing_read_long_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'dyn_ftrace_total_info' entry\n"); +#endif +} + +/* dummy trace to disable tracing */ +static struct tracer no_tracer __read_mostly = +{ + .name = "none", +}; + +static inline notrace int page_order(const unsigned long size) +{ + const unsigned long nr_pages = DIV_ROUND_UP(size, PAGE_SIZE); + return ilog2(roundup_pow_of_two(nr_pages)); +} + +__init static int tracer_alloc_buffers(void) +{ + const int order = page_order(trace_nr_entries * TRACE_ENTRY_SIZE); + const unsigned long size = (1UL << order) << PAGE_SHIFT; + struct trace_entry *array; + int i; + + for_each_possible_cpu(i) { + global_trace.data[i] = &per_cpu(global_trace_cpu, i); + max_tr.data[i] = &per_cpu(max_data, i); + + array = (struct trace_entry *) + __get_free_pages(GFP_KERNEL, order); + if (array == NULL) { + printk(KERN_ERR "tracer: failed to allocate" + " %ld bytes for trace buffer!\n", size); + goto free_buffers; + } + global_trace.data[i]->trace = array; + +/* Only allocate if we are actually using the max trace */ +#ifdef CONFIG_TRACER_MAX_TRACE + array = (struct trace_entry *) + __get_free_pages(GFP_KERNEL, order); + if (array == NULL) { + printk(KERN_ERR "wakeup tracer: failed to allocate" + " %ld bytes for trace buffer!\n", size); + goto free_buffers; + } + max_tr.data[i]->trace = array; +#endif + } + + /* + * Since we allocate by orders of pages, we may be able to + * round up a bit. 
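+	 * The usable entry count is therefore recomputed below from the
+	 * full power-of-two allocation size.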
+ */ + global_trace.entries = size / TRACE_ENTRY_SIZE; + max_tr.entries = global_trace.entries; + + pr_info("tracer: %ld bytes allocated for %ld", + size, trace_nr_entries); + pr_info(" entries of %ld bytes\n", (long)TRACE_ENTRY_SIZE); + pr_info(" actual entries %ld\n", global_trace.entries); + + tracer_init_debugfs(); + + trace_init_cmdlines(); + + register_tracer(&no_tracer); + current_trace = &no_tracer; + + return 0; + + free_buffers: + for (i-- ; i >= 0; i--) { + struct trace_array_cpu *data = global_trace.data[i]; + + if (data && data->trace) { + free_pages((unsigned long)data->trace, order); + data->trace = NULL; + } + +#ifdef CONFIG_TRACER_MAX_TRACE + data = max_tr.data[i]; + if (data && data->trace) { + free_pages((unsigned long)data->trace, order); + data->trace = NULL; + } +#endif + } + return -ENOMEM; +} + +device_initcall(tracer_alloc_buffers); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h new file mode 100644 index 00000000000..3173a93561d --- /dev/null +++ b/kernel/trace/trace.h @@ -0,0 +1,184 @@ +#ifndef _LINUX_KERNEL_TRACE_H +#define _LINUX_KERNEL_TRACE_H + +#include +#include +#include +#include + +/* + * Function trace entry - function address and parent function addres: + */ +struct ftrace_entry { + unsigned long ip; + unsigned long parent_ip; +}; + +/* + * Context switch trace entry - which task (and prio) we switched from/to: + */ +struct ctx_switch_entry { + unsigned int prev_pid; + unsigned char prev_prio; + unsigned char prev_state; + unsigned int next_pid; + unsigned char next_prio; +}; + +/* + * The trace entry - the most basic unit of tracing. This is what + * is printed in the end as a single line in the trace output, such as: + * + * bash-15816 [01] 235.197585: idle_cpu <- irq_enter + */ +struct trace_entry { + char type; + char cpu; + char flags; + char preempt_count; + int pid; + cycle_t t; + unsigned long idx; + union { + struct ftrace_entry fn; + struct ctx_switch_entry ctx; + }; +}; + +#define TRACE_ENTRY_SIZE sizeof(struct trace_entry) + +/* + * The CPU trace array - it consists of thousands of trace entries + * plus some other descriptor data: (for example which task started + * the trace, etc.) + */ +struct trace_array_cpu { + void *trace; + unsigned long trace_idx; + atomic_t disabled; + atomic_t underrun; + unsigned long saved_latency; + unsigned long critical_start; + unsigned long critical_end; + unsigned long critical_sequence; + unsigned long nice; + unsigned long policy; + unsigned long rt_priority; + cycle_t preempt_timestamp; + pid_t pid; + uid_t uid; + char comm[TASK_COMM_LEN]; +}; + +struct trace_iterator; + +/* + * The trace array - an array of per-CPU trace arrays. This is the + * highest level data structure that individual tracers deal with. 
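+ * (There is one global_trace instance, plus max_tr whose buffers are
+ * only allocated when CONFIG_TRACER_MAX_TRACE is set.)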
+ * They have on/off state as well: + */ +struct trace_array { + unsigned long entries; + long ctrl; + int cpu; + cycle_t time_start; + struct trace_array_cpu *data[NR_CPUS]; +}; + +/* + * A specific tracer, represented by methods that operate on a trace array: + */ +struct tracer { + const char *name; + void (*init)(struct trace_array *tr); + void (*reset)(struct trace_array *tr); + void (*open)(struct trace_iterator *iter); + void (*close)(struct trace_iterator *iter); + void (*start)(struct trace_iterator *iter); + void (*stop)(struct trace_iterator *iter); + void (*ctrl_update)(struct trace_array *tr); + struct tracer *next; + int print_max; +}; + +/* + * Trace iterator - used by printout routines who present trace + * results to users and which routines might sleep, etc: + */ +struct trace_iterator { + struct trace_array *tr; + struct tracer *trace; + struct trace_entry *ent; + unsigned long iter_flags; + loff_t pos; + unsigned long next_idx[NR_CPUS]; + int cpu; + int idx; +}; + +void notrace tracing_reset(struct trace_array_cpu *data); +int tracing_open_generic(struct inode *inode, struct file *filp); +struct dentry *tracing_init_dentry(void); +void ftrace(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags); +void tracing_sched_switch_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags); +void tracing_record_cmdline(struct task_struct *tsk); + +void tracing_start_function_trace(void); +void tracing_stop_function_trace(void); +int register_tracer(struct tracer *type); +void unregister_tracer(struct tracer *type); + +extern unsigned long nsecs_to_usecs(unsigned long nsecs); + +extern unsigned long tracing_max_latency; +extern unsigned long tracing_thresh; + +void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); +void update_max_tr_single(struct trace_array *tr, + struct task_struct *tsk, int cpu); + +static inline notrace cycle_t now(int cpu) +{ + return cpu_clock(cpu); +} + +#ifdef CONFIG_SCHED_TRACER +extern void notrace +wakeup_sched_switch(struct task_struct *prev, struct task_struct *next); +#else +static inline void +wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) +{ +} +#endif + +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +typedef void +(*tracer_switch_func_t)(void *private, + struct task_struct *prev, + struct task_struct *next); + +struct tracer_switch_ops { + tracer_switch_func_t func; + void *private; + struct tracer_switch_ops *next; +}; + +extern int register_tracer_switch(struct tracer_switch_ops *ops); +extern int unregister_tracer_switch(struct tracer_switch_ops *ops); + +#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ + +#ifdef CONFIG_DYNAMIC_FTRACE +extern unsigned long ftrace_update_tot_cnt; +#endif + +#endif /* _LINUX_KERNEL_TRACE_H */ -- cgit v1.2.3 From 1b29b01887e6032dcaf818c14999c7a39593b4e7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: function tracer This is a simple trace that uses the ftrace infrastructure. It is designed to be fast and small, and easy to use. It is useful to record things that happen over a very short period of time, and not to analyze the system in general. Updates: available_tracers "function" is added to this file. 
current_tracer To enable the function tracer: echo function > /debugfs/tracing/current_tracer To disable the tracer: echo disable > /debugfs/tracing/current_tracer The output of the function_trace file is as follows "echo noverbose > /debugfs/tracing/iter_ctrl" preemption latency trace v1.1.5 on 2.6.24-rc7-tst Signed-off-by: Ingo Molnar -------------------------------------------------------------------- latency: 0 us, #419428/4361791, CPU#1 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:4) ----------------- | task: -0 (uid:0 nice:0 policy:0 rt_prio:0) ----------------- _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / swapper-0 0d.h. 1595128us+: set_normalized_timespec+0x8/0x2d (ktime_get_ts+0x4a/0x4e ) swapper-0 0d.h. 1595131us+: _spin_lock+0x8/0x18 (hrtimer_interrupt+0x6e/0x1b0 ) Or with verbose turned on: "echo verbose > /debugfs/tracing/iter_ctrl" preemption latency trace v1.1.5 on 2.6.24-rc7-tst -------------------------------------------------------------------- latency: 0 us, #419428/4361791, CPU#1 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:4) ----------------- | task: -0 (uid:0 nice:0 policy:0 rt_prio:0) ----------------- swapper 0 0 9 00000000 00000000 [f3675f41] 1595.128ms (+0.003ms): set_normalized_timespec+0x8/0x2d (ktime_get_ts+0x4a/0x4e ) swapper 0 0 9 00000000 00000001 [f3675f45] 1595.131ms (+0.003ms): _spin_lock+0x8/0x18 (hrtimer_interrupt+0x6e/0x1b0 ) swapper 0 0 9 00000000 00000002 [f3675f48] 1595.135ms (+0.003ms): _spin_lock+0x8/0x18 (hrtimer_interrupt+0x6e/0x1b0 ) The "trace" file is not affected by the verbose mode, but is by the symonly. echo "nosymonly" > /debugfs/tracing/iter_ctrl tracer: [ 81.479967] CPU 0: bash:3154 register_ftrace_function+0x5f/0x66 <-- _spin_unlock_irqrestore+0xe/0x5a [ 81.479967] CPU 0: bash:3154 _spin_unlock_irqrestore+0x3e/0x5a <-- sub_preempt_count+0xc/0x7a [ 81.479968] CPU 0: bash:3154 sub_preempt_count+0x30/0x7a <-- in_lock_functions+0x9/0x24 [ 81.479968] CPU 0: bash:3154 vfs_write+0x11d/0x155 <-- dnotify_parent+0x12/0x78 [ 81.479968] CPU 0: bash:3154 dnotify_parent+0x2d/0x78 <-- _spin_lock+0xe/0x70 [ 81.479969] CPU 0: bash:3154 _spin_lock+0x1b/0x70 <-- add_preempt_count+0xe/0x77 [ 81.479969] CPU 0: bash:3154 add_preempt_count+0x3e/0x77 <-- in_lock_functions+0x9/0x24 echo "symonly" > /debugfs/tracing/iter_ctrl tracer: [ 81.479913] CPU 0: bash:3154 register_ftrace_function+0x5f/0x66 <-- _spin_unlock_irqrestore+0xe/0x5a [ 81.479913] CPU 0: bash:3154 _spin_unlock_irqrestore+0x3e/0x5a <-- sub_preempt_count+0xc/0x7a [ 81.479913] CPU 0: bash:3154 sub_preempt_count+0x30/0x7a <-- in_lock_functions+0x9/0x24 [ 81.479914] CPU 0: bash:3154 vfs_write+0x11d/0x155 <-- dnotify_parent+0x12/0x78 [ 81.479914] CPU 0: bash:3154 dnotify_parent+0x2d/0x78 <-- _spin_lock+0xe/0x70 [ 81.479914] CPU 0: bash:3154 _spin_lock+0x1b/0x70 <-- add_preempt_count+0xe/0x77 [ 81.479914] CPU 0: bash:3154 add_preempt_count+0x3e/0x77 <-- in_lock_functions+0x9/0x24 Signed-off-by: Steven Rostedt Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/Kconfig | 13 ++++++++ kernel/trace/Makefile | 1 + kernel/trace/trace_functions.c | 73 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) create mode 100644 kernel/trace/trace_functions.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ce70677afbf..1399f372b5d 100644 --- a/kernel/trace/Kconfig +++ 
b/kernel/trace/Kconfig @@ -8,3 +8,16 @@ config TRACING bool select DEBUG_FS +config FTRACE + bool "Kernel Function Tracer" + depends on DEBUG_KERNEL && HAVE_FTRACE + select FRAME_POINTER + select TRACING + help + Enable the kernel to trace every kernel function. This is done + by using a compiler feature to insert a small, 5-byte No-Operation + instruction to the beginning of every kernel function, which NOP + sequence is then dynamically patched into a tracer call when + tracing is enabled by the administrator. If it's runtime disabled + (the bootup default), then the overhead of the instructions is very + small and not measurable even in micro-benchmarks. diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 7af40317525..6bb5e50b4a4 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_FTRACE) += libftrace.o obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_FTRACE) += trace_functions.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c new file mode 100644 index 00000000000..82988c5336e --- /dev/null +++ b/kernel/trace/trace_functions.c @@ -0,0 +1,73 @@ +/* + * ring buffer based function tracer + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include + +#include "trace.h" + +static notrace void function_reset(struct trace_array *tr) +{ + int cpu; + + tr->time_start = now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} + +static notrace void start_function_trace(struct trace_array *tr) +{ + function_reset(tr); + tracing_start_function_trace(); +} + +static notrace void stop_function_trace(struct trace_array *tr) +{ + tracing_stop_function_trace(); +} + +static notrace void function_trace_init(struct trace_array *tr) +{ + if (tr->ctrl) + start_function_trace(tr); +} + +static notrace void function_trace_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_function_trace(tr); +} + +static notrace void function_trace_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_function_trace(tr); + else + stop_function_trace(tr); +} + +static struct tracer function_trace __read_mostly = +{ + .name = "ftrace", + .init = function_trace_init, + .reset = function_trace_reset, + .ctrl_update = function_trace_ctrl_update, +}; + +static __init int init_function_trace(void) +{ + return register_tracer(&function_trace); +} + +device_initcall(init_function_trace); -- cgit v1.2.3 From 35e8e302e5d6e32675df2fc1dd3a53dfa6630dc1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: add tracing of context switches This patch adds context switch tracing, of the format of: _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | pid:prio:state \ / ||||| \ | / swapper-0 1d..3 137us+: 0:140:R --> 2912:120 sshd-2912 1d..3 216us+: 2912:120:S --> 0:140 swapper-0 1d..3 261us+: 0:140:R --> 2912:120 bash-2920 0d..3 267us+: 2920:120:S --> 0:140 sshd-2912 1d..3 330us!: 2912:120:S --> 0:140 swapper-0 1d..3 2389us+: 0:140:R --> 2847:120 yum-upda-2847 1d..3 2411us!: 2847:120:S --> 0:140 swapper-0 0d..3 11089us+: 0:140:R --> 3139:120 gdm-bina-3139 0d..3 11113us!: 3139:120:S --> 0:140 swapper-0 1d..3 102328us+: 0:140:R --> 2847:120 yum-upda-2847 
1d..3 102348us!: 2847:120:S --> 0:140 "sched_switch" is added to /debugfs/tracing/available_tracers [ Eugene Teo Cc: Mathieu Desnoyers Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/Kconfig | 11 ++++ kernel/trace/Makefile | 1 + kernel/trace/trace_sched_switch.c | 116 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 128 insertions(+) create mode 100644 kernel/trace/trace_sched_switch.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1399f372b5d..5d6aa92866c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -13,6 +13,7 @@ config FTRACE depends on DEBUG_KERNEL && HAVE_FTRACE select FRAME_POINTER select TRACING + select CONTEXT_SWITCH_TRACER help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation @@ -21,3 +22,13 @@ config FTRACE tracing is enabled by the administrator. If it's runtime disabled (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. + +config CONTEXT_SWITCH_TRACER + bool "Trace process context switches" + depends on DEBUG_KERNEL + select TRACING + select MARKERS + help + This tracer gets called from the context switch and records + all switching of tasks. + diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 6bb5e50b4a4..6b54ceb7f16 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,6 +1,7 @@ obj-$(CONFIG_FTRACE) += libftrace.o obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FTRACE) += trace_functions.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c new file mode 100644 index 00000000000..3e4771d3b89 --- /dev/null +++ b/kernel/trace/trace_sched_switch.c @@ -0,0 +1,116 @@ +/* + * trace context switch + * + * Copyright (C) 2007 Steven Rostedt + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *ctx_trace; +static int __read_mostly tracer_enabled; + +static void notrace +ctx_switch_func(struct task_struct *prev, struct task_struct *next) +{ + struct trace_array *tr = ctx_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (!tracer_enabled) + return; + + raw_local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + tracing_sched_switch_trace(tr, data, prev, next, flags); + + atomic_dec(&data->disabled); + raw_local_irq_restore(flags); +} + +void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) +{ + tracing_record_cmdline(prev); + + /* + * If tracer_switch_func only points to the local + * switch func, it still needs the ptr passed to it. 
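+	 * Call it directly here with the same prev/next pointers.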
+ */ + ctx_switch_func(prev, next); + + /* + * Chain to the wakeup tracer (this is a NOP if disabled): + */ + wakeup_sched_switch(prev, next); +} + +static notrace void sched_switch_reset(struct trace_array *tr) +{ + int cpu; + + tr->time_start = now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} + +static notrace void start_sched_trace(struct trace_array *tr) +{ + sched_switch_reset(tr); + tracer_enabled = 1; +} + +static notrace void stop_sched_trace(struct trace_array *tr) +{ + tracer_enabled = 0; +} + +static notrace void sched_switch_trace_init(struct trace_array *tr) +{ + ctx_trace = tr; + + if (tr->ctrl) + start_sched_trace(tr); +} + +static notrace void sched_switch_trace_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_sched_trace(tr); +} + +static void sched_switch_trace_ctrl_update(struct trace_array *tr) +{ + /* When starting a new trace, reset the buffers */ + if (tr->ctrl) + start_sched_trace(tr); + else + stop_sched_trace(tr); +} + +static struct tracer sched_switch_trace __read_mostly = +{ + .name = "sched_switch", + .init = sched_switch_trace_init, + .reset = sched_switch_trace_reset, + .ctrl_update = sched_switch_trace_ctrl_update, +}; + +__init static int init_sched_switch_trace(void) +{ + return register_tracer(&sched_switch_trace); +} +device_initcall(init_sched_switch_trace); -- cgit v1.2.3 From 352ad25aa4a189c667cb2af333948d34692a2d27 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: tracer for scheduler wakeup latency This patch adds the tracer that tracks the wakeup latency of the highest priority waking task. "wakeup" is added to /debugfs/tracing/available_tracers Also added to /debugfs/tracing tracing_max_latency holds the current max latency for the wakeup wakeup_thresh if set to other than zero, a log will be recorded for every wakeup that takes longer than the number entered in here (usecs for all counters) (deletes previous trace) Examples: (with ftrace_enabled = 0) ============ preemption latency trace v1.1.5 on 2.6.24-rc8 Signed-off-by: Ingo Molnar -------------------------------------------------------------------- latency: 26 us, #2/2, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: migration/0-3 (uid:0 nice:-5 policy:1 rt_prio:99) ----------------- _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / quilt-8551 0d..3 0us+: wake_up_process+0x15/0x17 (sched_exec+0xc9/0x100 ) quilt-8551 0d..4 26us : sched_switch_callback+0x73/0x81 (schedule+0x483/0x6d5 ) vim:ft=help ============ (with ftrace_enabled = 1) ============ preemption latency trace v1.1.5 on 2.6.24-rc8 -------------------------------------------------------------------- latency: 36 us, #45/45, CPU#0 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: migration/1-5 (uid:0 nice:-5 policy:1 rt_prio:99) ----------------- _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / bash-10653 1d..3 0us : wake_up_process+0x15/0x17 (sched_exec+0xc9/0x100 ) bash-10653 1d..3 1us : try_to_wake_up+0x271/0x2e7 (sub_preempt_count+0xc/0x7a ) bash-10653 1d..2 2us : try_to_wake_up+0x296/0x2e7 (update_rq_clock+0x9/0x20 ) bash-10653 1d..2 2us : update_rq_clock+0x1e/0x20 (__update_rq_clock+0xc/0x90 ) bash-10653 1d..2 3us : __update_rq_clock+0x1b/0x90 
(sched_clock+0x9/0x29 ) bash-10653 1d..2 4us : try_to_wake_up+0x2a6/0x2e7 (activate_task+0xc/0x3f ) bash-10653 1d..2 4us : activate_task+0x2d/0x3f (enqueue_task+0xe/0x66 ) bash-10653 1d..2 5us : enqueue_task+0x5b/0x66 (enqueue_task_rt+0x9/0x3c ) bash-10653 1d..2 6us : try_to_wake_up+0x2ba/0x2e7 (check_preempt_wakeup+0x12/0x99 ) [...] bash-10653 1d..5 33us : tracing_record_cmdline+0xcf/0xd4 (_spin_unlock+0x9/0x33 ) bash-10653 1d..5 34us : _spin_unlock+0x19/0x33 (sub_preempt_count+0xc/0x7a ) bash-10653 1d..4 35us : wakeup_sched_switch+0x65/0x2ff (_spin_lock_irqsave+0xc/0xa9 ) bash-10653 1d..4 35us : _spin_lock_irqsave+0x19/0xa9 (add_preempt_count+0xe/0x77 ) bash-10653 1d..4 36us : sched_switch_callback+0x73/0x81 (schedule+0x483/0x6d5 ) vim:ft=help ============ The [...] was added here to not waste your email box space. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/Kconfig | 13 ++ kernel/trace/Makefile | 1 + kernel/trace/trace_sched_wakeup.c | 310 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 324 insertions(+) create mode 100644 kernel/trace/trace_sched_wakeup.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d6aa92866c..892ecc94a82 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -4,6 +4,9 @@ config HAVE_FTRACE bool +config TRACER_MAX_TRACE + bool + config TRACING bool select DEBUG_FS @@ -23,6 +26,16 @@ config FTRACE (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. +config SCHED_TRACER + bool "Scheduling Latency Tracer" + depends on DEBUG_KERNEL + select TRACING + select CONTEXT_SWITCH_TRACER + select TRACER_MAX_TRACE + help + This tracer tracks the latency of the highest priority task + to be scheduled in, starting from the point it has woken up. + config CONTEXT_SWITCH_TRACER bool "Trace process context switches" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 6b54ceb7f16..5508cdb19ae 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -3,5 +3,6 @@ obj-$(CONFIG_FTRACE) += libftrace.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FTRACE) += trace_functions.o +obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c new file mode 100644 index 00000000000..7c3ccefcf4c --- /dev/null +++ b/kernel/trace/trace_sched_wakeup.c @@ -0,0 +1,310 @@ +/* + * trace task wakeup timings + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *wakeup_trace; +static int __read_mostly tracer_enabled; + +static struct task_struct *wakeup_task; +static int wakeup_cpu; +static unsigned wakeup_prio = -1; + +static DEFINE_SPINLOCK(wakeup_lock); + +static void notrace __wakeup_reset(struct trace_array *tr); + +/* + * Should this new latency be reported/recorded? 
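+ * Only if it exceeds tracing_thresh when a threshold is set, or if it
+ * beats the current tracing_max_latency otherwise.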
+ */ +static int notrace report_latency(cycle_t delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return 0; + } else { + if (delta <= tracing_max_latency) + return 0; + } + return 1; +} + +void notrace +wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) +{ + unsigned long latency = 0, t0 = 0, t1 = 0; + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + cycle_t T0, T1, delta; + unsigned long flags; + long disabled; + int cpu; + + if (unlikely(!tracer_enabled)) + return; + + /* + * When we start a new trace, we set wakeup_task to NULL + * and then set tracer_enabled = 1. We want to make sure + * that another CPU does not see the tracer_enabled = 1 + * and the wakeup_task with an older task, that might + * actually be the same as next. + */ + smp_rmb(); + + if (next != wakeup_task) + return; + + /* The task we are waitng for is waking up */ + data = tr->data[wakeup_cpu]; + + /* disable local data, not wakeup_cpu data */ + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&tr->data[cpu]->disabled); + if (likely(disabled != 1)) + goto out; + + spin_lock_irqsave(&wakeup_lock, flags); + + /* We could race with grabbing wakeup_lock */ + if (unlikely(!tracer_enabled || next != wakeup_task)) + goto out_unlock; + + ftrace(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags); + + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = data->preempt_timestamp; + T1 = now(cpu); + delta = T1-T0; + + if (!report_latency(delta)) + goto out_unlock; + + latency = nsecs_to_usecs(delta); + + tracing_max_latency = delta; + t0 = nsecs_to_usecs(T0); + t1 = nsecs_to_usecs(T1); + + update_max_tr(tr, wakeup_task, wakeup_cpu); + + if (tracing_thresh) { + printk(KERN_INFO "(%16s-%-5d|#%d): %lu us wakeup latency " + "violates %lu us threshold.\n" + " => started at timestamp %lu: ", + wakeup_task->comm, wakeup_task->pid, + raw_smp_processor_id(), + latency, nsecs_to_usecs(tracing_thresh), t0); + } else { + printk(KERN_INFO "(%16s-%-5d|#%d): new %lu us maximum " + "wakeup latency.\n => started at timestamp %lu: ", + wakeup_task->comm, wakeup_task->pid, + cpu, latency, t0); + } + + printk(KERN_CONT " ended at timestamp %lu: ", t1); + dump_stack(); + t1 = nsecs_to_usecs(now(cpu)); + printk(KERN_CONT " dump-end timestamp %lu\n\n", t1); + +out_unlock: + __wakeup_reset(tr); + spin_unlock_irqrestore(&wakeup_lock, flags); +out: + atomic_dec(&tr->data[cpu]->disabled); +} + +static void notrace __wakeup_reset(struct trace_array *tr) +{ + struct trace_array_cpu *data; + int cpu; + + assert_spin_locked(&wakeup_lock); + + for_each_possible_cpu(cpu) { + data = tr->data[cpu]; + tracing_reset(data); + } + + wakeup_cpu = -1; + wakeup_prio = -1; + + if (wakeup_task) + put_task_struct(wakeup_task); + + wakeup_task = NULL; +} + +static void notrace wakeup_reset(struct trace_array *tr) +{ + unsigned long flags; + + spin_lock_irqsave(&wakeup_lock, flags); + __wakeup_reset(tr); + spin_unlock_irqrestore(&wakeup_lock, flags); +} + +static notrace void +wakeup_check_start(struct trace_array *tr, struct task_struct *p, + struct task_struct *curr) +{ + int cpu = smp_processor_id(); + unsigned long flags; + long disabled; + + if (likely(!rt_task(p)) || + p->prio >= wakeup_prio || + p->prio >= curr->prio) + return; + + disabled = atomic_inc_return(&tr->data[cpu]->disabled); + if (unlikely(disabled != 1)) + goto out; + + /* interrupts should be off from try_to_wake_up */ + spin_lock(&wakeup_lock); + + /* check for races. 
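+	 * The tracer may have been disabled, or a task of equal or higher
+	 * priority may already be the wakeup target, while we waited for
+	 * wakeup_lock.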
*/ + if (!tracer_enabled || p->prio >= wakeup_prio) + goto out_locked; + + /* reset the trace */ + __wakeup_reset(tr); + + wakeup_cpu = task_cpu(p); + wakeup_prio = p->prio; + + wakeup_task = p; + get_task_struct(wakeup_task); + + local_save_flags(flags); + + tr->data[wakeup_cpu]->preempt_timestamp = now(cpu); + ftrace(tr, tr->data[wakeup_cpu], CALLER_ADDR1, CALLER_ADDR2, flags); + +out_locked: + spin_unlock(&wakeup_lock); +out: + atomic_dec(&tr->data[cpu]->disabled); +} + +notrace void +ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) +{ + if (likely(!tracer_enabled)) + return; + + wakeup_check_start(wakeup_trace, wakee, curr); +} + +notrace void +ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr) +{ + if (likely(!tracer_enabled)) + return; + + wakeup_check_start(wakeup_trace, wakee, curr); +} + +static notrace void start_wakeup_tracer(struct trace_array *tr) +{ + wakeup_reset(tr); + + /* + * Don't let the tracer_enabled = 1 show up before + * the wakeup_task is reset. This may be overkill since + * wakeup_reset does a spin_unlock after setting the + * wakeup_task to NULL, but I want to be safe. + * This is a slow path anyway. + */ + smp_wmb(); + + tracer_enabled = 1; + + return; +} + +static notrace void stop_wakeup_tracer(struct trace_array *tr) +{ + tracer_enabled = 0; +} + +static notrace void wakeup_tracer_init(struct trace_array *tr) +{ + wakeup_trace = tr; + + if (tr->ctrl) + start_wakeup_tracer(tr); +} + +static notrace void wakeup_tracer_reset(struct trace_array *tr) +{ + if (tr->ctrl) { + stop_wakeup_tracer(tr); + /* make sure we put back any tasks we are tracing */ + wakeup_reset(tr); + } +} + +static void wakeup_tracer_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_wakeup_tracer(tr); + else + stop_wakeup_tracer(tr); +} + +static void notrace wakeup_tracer_open(struct trace_iterator *iter) +{ + /* stop the trace while dumping */ + if (iter->tr->ctrl) + stop_wakeup_tracer(iter->tr); +} + +static void notrace wakeup_tracer_close(struct trace_iterator *iter) +{ + /* forget about any processes we were recording */ + if (iter->tr->ctrl) + start_wakeup_tracer(iter->tr); +} + +static struct tracer wakeup_tracer __read_mostly = +{ + .name = "wakeup", + .init = wakeup_tracer_init, + .reset = wakeup_tracer_reset, + .open = wakeup_tracer_open, + .close = wakeup_tracer_close, + .ctrl_update = wakeup_tracer_ctrl_update, + .print_max = 1, +}; + +__init static int init_wakeup_tracer(void) +{ + int ret; + + ret = register_tracer(&wakeup_tracer); + if (ret) + return ret; + + return 0; +} +device_initcall(init_wakeup_tracer); -- cgit v1.2.3 From 81d68a96a39844853b37f20cc8282d9b65b78ef3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: trace irq disabled critical timings This patch adds latency tracing for critical timings (how long interrupts are disabled for). "irqsoff" is added to /debugfs/tracing/available_tracers Note: tracing_max_latency also holds the max latency for irqsoff (in usecs). (default to large number so one must start latency tracing) tracing_thresh threshold (in usecs) to always print out if irqs off is detected to be longer than stated here. If irq_thresh is non-zero, then max_irq_latency is ignored. 
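A minimal usage sketch (an assumption based on the other tracers in this series, reusing the /debugfs mount point shown in the examples below):

  echo irqsoff > /debugfs/tracing/current_tracer
  echo 0 > /debugfs/tracing/tracing_max_latency
  echo 1 > /debugfs/tracing/tracing_enabled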
Here's an example of a trace with ftrace_enabled = 0 ======= preemption latency trace v1.1.5 on 2.6.24-rc7 Signed-off-by: Ingo Molnar -------------------------------------------------------------------- latency: 100 us, #3/3, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0) ----------------- => started at: _spin_lock_irqsave+0x2a/0xb7 => ended at: _spin_unlock_irqrestore+0x32/0x5f _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / swapper-0 1d.s3 0us+: _spin_lock_irqsave+0x2a/0xb7 (e1000_update_stats+0x47/0x64c [e1000]) swapper-0 1d.s3 100us : _spin_unlock_irqrestore+0x32/0x5f (e1000_update_stats+0x641/0x64c [e1000]) swapper-0 1d.s3 100us : trace_hardirqs_on_caller+0x75/0x89 (_spin_unlock_irqrestore+0x32/0x5f) vim:ft=help ======= And this is a trace with ftrace_enabled == 1 ======= preemption latency trace v1.1.5 on 2.6.24-rc7 -------------------------------------------------------------------- latency: 102 us, #12/12, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0) ----------------- => started at: _spin_lock_irqsave+0x2a/0xb7 => ended at: _spin_unlock_irqrestore+0x32/0x5f _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / swapper-0 1dNs3 0us+: _spin_lock_irqsave+0x2a/0xb7 (e1000_update_stats+0x47/0x64c [e1000]) swapper-0 1dNs3 46us : e1000_read_phy_reg+0x16/0x225 [e1000] (e1000_update_stats+0x5e2/0x64c [e1000]) swapper-0 1dNs3 46us : e1000_swfw_sync_acquire+0x10/0x99 [e1000] (e1000_read_phy_reg+0x49/0x225 [e1000]) swapper-0 1dNs3 46us : e1000_get_hw_eeprom_semaphore+0x12/0xa6 [e1000] (e1000_swfw_sync_acquire+0x36/0x99 [e1000]) swapper-0 1dNs3 47us : __const_udelay+0x9/0x47 (e1000_read_phy_reg+0x116/0x225 [e1000]) swapper-0 1dNs3 47us+: __delay+0x9/0x50 (__const_udelay+0x45/0x47) swapper-0 1dNs3 97us : preempt_schedule+0xc/0x84 (__delay+0x4e/0x50) swapper-0 1dNs3 98us : e1000_swfw_sync_release+0xc/0x55 [e1000] (e1000_read_phy_reg+0x211/0x225 [e1000]) swapper-0 1dNs3 99us+: e1000_put_hw_eeprom_semaphore+0x9/0x35 [e1000] (e1000_swfw_sync_release+0x50/0x55 [e1000]) swapper-0 1dNs3 101us : _spin_unlock_irqrestore+0xe/0x5f (e1000_update_stats+0x641/0x64c [e1000]) swapper-0 1dNs3 102us : _spin_unlock_irqrestore+0x32/0x5f (e1000_update_stats+0x641/0x64c [e1000]) swapper-0 1dNs3 102us : trace_hardirqs_on_caller+0x75/0x89 (_spin_unlock_irqrestore+0x32/0x5f) vim:ft=help ======= Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/fork.c | 2 +- kernel/lockdep.c | 23 ++- kernel/printk.c | 2 + kernel/trace/Kconfig | 18 ++ kernel/trace/Makefile | 1 + kernel/trace/trace_irqsoff.c | 402 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 443 insertions(+), 5 deletions(-) create mode 100644 kernel/trace/trace_irqsoff.c (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 19908b26cf8..d66d676dc36 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -909,7 +909,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, rt_mutex_init_task(p); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_LOCKDEP) DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif diff --git a/kernel/lockdep.c 
b/kernel/lockdep.c index 81a4e4a3f08..e21924365ea 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -982,7 +983,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) return 1; } -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new dependency @@ -1680,7 +1681,7 @@ valid_state(struct task_struct *curr, struct held_lock *this, static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * print irq inversion bug: @@ -2013,11 +2014,13 @@ void early_boot_irqs_on(void) /* * Hardirqs will be enabled: */ -void trace_hardirqs_on(void) +void notrace trace_hardirqs_on_caller(unsigned long a0) { struct task_struct *curr = current; unsigned long ip; + time_hardirqs_on(CALLER_ADDR0, a0); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2055,16 +2058,23 @@ void trace_hardirqs_on(void) curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_on_events); } +EXPORT_SYMBOL(trace_hardirqs_on_caller); +void notrace trace_hardirqs_on(void) +{ + trace_hardirqs_on_caller(CALLER_ADDR0); +} EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void trace_hardirqs_off(void) +void notrace trace_hardirqs_off_caller(unsigned long a0) { struct task_struct *curr = current; + time_hardirqs_off(CALLER_ADDR0, a0); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2082,7 +2092,12 @@ void trace_hardirqs_off(void) } else debug_atomic_inc(&redundant_hardirqs_off); } +EXPORT_SYMBOL(trace_hardirqs_off_caller); +void notrace trace_hardirqs_off(void) +{ + trace_hardirqs_off_caller(CALLER_ADDR0); +} EXPORT_SYMBOL(trace_hardirqs_off); /* diff --git a/kernel/printk.c b/kernel/printk.c index 8fb01c32aa3..ae7d5b9e535 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1041,7 +1041,9 @@ void release_console_sem(void) _log_end = log_end; con_start = log_end; /* Flush */ spin_unlock(&logbuf_lock); + stop_critical_timings(); /* don't trace print latency */ call_console_drivers(_con_start, _log_end); + start_critical_timings(); local_irq_restore(flags); } console_locked = 0; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 892ecc94a82..896df1cf6ad 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -26,6 +26,24 @@ config FTRACE (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. +config IRQSOFF_TRACER + bool "Interrupts-off Latency Tracer" + default n + depends on TRACE_IRQFLAGS_SUPPORT + depends on GENERIC_TIME + select TRACE_IRQFLAGS + select TRACING + select TRACER_MAX_TRACE + help + This option measures the time spent in irqs-off critical + sections, with microsecond accuracy. 
+ + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /debugfs/tracing/tracing_max_latency + config SCHED_TRACER bool "Scheduling Latency Tracer" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 5508cdb19ae..46be8647fb6 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_FTRACE) += libftrace.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FTRACE) += trace_functions.o +obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c new file mode 100644 index 00000000000..a9131b0cf1a --- /dev/null +++ b/kernel/trace/trace_irqsoff.c @@ -0,0 +1,402 @@ +/* + * trace irqs off criticall timings + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * From code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *irqsoff_trace __read_mostly; +static int tracer_enabled __read_mostly; + +/* + * Sequence count - we record it when starting a measurement and + * skip the latency if the sequence has changed - some other section + * did a maximum and could disturb our measurement with serial console + * printouts, etc. Truly coinciding maximum latencies should be rare + * and what happens together happens separately as well, so this doesnt + * decrease the validity of the maximum found: + */ +static __cacheline_aligned_in_smp unsigned long max_sequence; + +#ifdef CONFIG_FTRACE +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void notrace +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (likely(!tracer_enabled)) + return; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) + return; + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + ftrace(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = irqsoff_tracer_call, +}; +#endif /* CONFIG_FTRACE */ + +/* + * Should this new latency be reported/recorded? 
+ */ +static int notrace report_latency(cycle_t delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return 0; + } else { + if (delta <= tracing_max_latency) + return 0; + } + return 1; +} + +static void notrace +check_critical_timing(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long parent_ip, + int cpu) +{ + unsigned long latency, t0, t1; + cycle_t T0, T1, T2, delta; + unsigned long flags; + + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = data->preempt_timestamp; + T1 = now(cpu); + delta = T1-T0; + + local_save_flags(flags); + + if (!report_latency(delta)) + goto out; + + ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); + /* + * Update the timestamp, because the trace entry above + * might change it (it can only get larger so the latency + * is fair to be reported): + */ + T2 = now(cpu); + + delta = T2-T0; + + latency = nsecs_to_usecs(delta); + + if (data->critical_sequence != max_sequence) + goto out; + + tracing_max_latency = delta; + t0 = nsecs_to_usecs(T0); + t1 = nsecs_to_usecs(T1); + + data->critical_end = parent_ip; + + update_max_tr_single(tr, current, cpu); + + if (tracing_thresh) + printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical section " + "violates %lu us threshold.\n" + " => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, nsecs_to_usecs(tracing_thresh), t0); + else + printk(KERN_INFO "(%16s-%-5d|#%d):" + " new %lu us maximum-latency " + "critical section.\n => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, t0); + + print_symbol(KERN_CONT "<%s>\n", data->critical_start); + printk(KERN_CONT " => ended at timestamp %lu: ", t1); + print_symbol(KERN_CONT "<%s>\n", data->critical_end); + dump_stack(); + t1 = nsecs_to_usecs(now(cpu)); + printk(KERN_CONT " => dump-end timestamp %lu\n\n", t1); + + max_sequence++; + +out: + data->critical_sequence = max_sequence; + data->preempt_timestamp = now(cpu); + tracing_reset(data); + ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); +} + +static inline void notrace +start_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (likely(!tracer_enabled)) + return; + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(!data) || unlikely(!data->trace) || + data->critical_start || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + + data->critical_sequence = max_sequence; + data->preempt_timestamp = now(cpu); + data->critical_start = parent_ip; + tracing_reset(data); + + local_save_flags(flags); + ftrace(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); +} + +static inline void notrace +stop_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (likely(!tracer_enabled)) + return; + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(!data) || unlikely(!data->trace) || + !data->critical_start || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + local_save_flags(flags); + ftrace(tr, data, ip, parent_ip, flags); + check_critical_timing(tr, data, parent_ip, cpu); + data->critical_start = 0; + atomic_dec(&data->disabled); +} + +void notrace start_critical_timings(void) +{ + unsigned long flags; + + 
local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +void notrace stop_critical_timings(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +#ifdef CONFIG_PROVE_LOCKING +void notrace time_hardirqs_on(unsigned long a0, unsigned long a1) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(a0, a1); +} + +void notrace time_hardirqs_off(unsigned long a0, unsigned long a1) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(a0, a1); +} + +#else /* !CONFIG_PROVE_LOCKING */ + +/* + * Stubs: + */ + +void early_boot_irqs_off(void) +{ +} + +void early_boot_irqs_on(void) +{ +} + +void trace_softirqs_on(unsigned long ip) +{ +} + +void trace_softirqs_off(unsigned long ip) +{ +} + +inline void print_irqtrace_events(struct task_struct *curr) +{ +} + +/* + * We are only interested in hardirq on/off events: + */ +void notrace trace_hardirqs_on(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +void notrace trace_hardirqs_off(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +void notrace trace_hardirqs_on_caller(unsigned long caller_addr) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(CALLER_ADDR0, caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +void notrace trace_hardirqs_off_caller(unsigned long caller_addr) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(CALLER_ADDR0, caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); + +#endif /* CONFIG_PROVE_LOCKING */ + +static void start_irqsoff_tracer(struct trace_array *tr) +{ + tracer_enabled = 1; + register_ftrace_function(&trace_ops); +} + +static void stop_irqsoff_tracer(struct trace_array *tr) +{ + unregister_ftrace_function(&trace_ops); + tracer_enabled = 0; +} + +static void irqsoff_tracer_init(struct trace_array *tr) +{ + irqsoff_trace = tr; + /* make sure that the tracer is visibel */ + smp_wmb(); + + if (tr->ctrl) + start_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_irqsoff_tracer(tr); + else + stop_irqsoff_tracer(tr); +} + +static void notrace irqsoff_tracer_open(struct trace_iterator *iter) +{ + /* stop the trace while dumping */ + if (iter->tr->ctrl) + stop_irqsoff_tracer(iter->tr); +} + +static void notrace irqsoff_tracer_close(struct trace_iterator *iter) +{ + if (iter->tr->ctrl) + start_irqsoff_tracer(iter->tr); +} + +static struct tracer irqsoff_tracer __read_mostly = +{ + .name = "irqsoff", + .init = irqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +}; + +__init static int init_irqsoff_tracer(void) +{ + register_tracer(&irqsoff_tracer); + + return 0; +} +device_initcall(init_irqsoff_tracer); -- cgit v1.2.3 From 
6cd8a4bb2f97527a9ceb30bc77ea4e959c6a95e3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: trace preempt off critical timings Add preempt off timings. A lot of kernel core code is taken from the RT patch latency trace that was written by Ingo Molnar. This adds "preemptoff" and "preemptirqsoff" to /debugfs/tracing/available_tracers Now instead of just tracing irqs off, preemption off can be selected to be recorded. When this is selected, it shares the same files as irqs off timings. One can either trace preemption off, irqs off, or one or the other off. By echoing "preemptoff" into /debugfs/tracing/current_tracer, recording of preempt off only is performed. "irqsoff" will only record the time irqs are disabled, but "preemptirqsoff" will take the total time irqs or preemption are disabled. Runtime switching of these options is now supported by simpling echoing in the appropriate trace name into /debugfs/tracing/current_tracer. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 24 +++++- kernel/trace/Kconfig | 25 ++++++ kernel/trace/Makefile | 1 + kernel/trace/trace_irqsoff.c | 184 +++++++++++++++++++++++++++++++------------ 4 files changed, 183 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 73e60085236..328494e28df 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include @@ -4365,26 +4366,44 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + +static inline unsigned long get_parent_ip(unsigned long addr) +{ + if (in_lock_functions(addr)) { + addr = CALLER_ADDR2; + if (in_lock_functions(addr)) + addr = CALLER_ADDR3; + } + return addr; +} void __kprobes add_preempt_count(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? */ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; +#endif preempt_count() += val; +#ifdef CONFIG_DEBUG_PREEMPT /* * Spinlock count overflowing soon? */ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); +#endif + if (preempt_count() == val) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } EXPORT_SYMBOL(add_preempt_count); void __kprobes sub_preempt_count(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? */ @@ -4396,7 +4415,10 @@ void __kprobes sub_preempt_count(int val) if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK))) return; +#endif + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); preempt_count() -= val; } EXPORT_SYMBOL(sub_preempt_count); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 896df1cf6ad..6430016b98e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -44,6 +44,31 @@ config IRQSOFF_TRACER echo 0 > /debugfs/tracing/tracing_max_latency + (Note that kernel size and overhead increases with this option + enabled. This option and the preempt-off timing option can be + used together or separately.) + +config PREEMPT_TRACER + bool "Preemption-off Latency Tracer" + default n + depends on GENERIC_TIME + depends on PREEMPT + select TRACING + select TRACER_MAX_TRACE + help + This option measures the time spent in preemption off critical + sections, with microsecond accuracy. 
+ + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /debugfs/tracing/tracing_max_latency + + (Note that kernel size and overhead increases with this option + enabled. This option and the irqs-off timing option can be + used together or separately.) + config SCHED_TRACER bool "Scheduling Latency Tracer" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 46be8647fb6..3fec653d653 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FTRACE) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o +obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a9131b0cf1a..8b1231633dc 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -21,6 +21,36 @@ static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; +static DEFINE_PER_CPU(int, tracing_cpu); + +enum { + TRACER_IRQS_OFF = (1 << 1), + TRACER_PREEMPT_OFF = (1 << 2), +}; + +static int trace_type __read_mostly; + +#ifdef CONFIG_PREEMPT_TRACER +static inline int notrace +preempt_trace(void) +{ + return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count()); +} +#else +# define preempt_trace() (0) +#endif + +#ifdef CONFIG_IRQSOFF_TRACER +static inline int notrace +irq_trace(void) +{ + return ((trace_type & TRACER_IRQS_OFF) && + irqs_disabled()); +} +#else +# define irq_trace() (0) +#endif + /* * Sequence count - we record it when starting a measurement and * skip the latency if the sequence has changed - some other section @@ -44,14 +74,11 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) long disabled; int cpu; - if (likely(!tracer_enabled)) + if (likely(!__get_cpu_var(tracing_cpu))) return; local_save_flags(flags); - if (!irqs_disabled_flags(flags)) - return; - cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); @@ -171,23 +198,29 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) if (likely(!tracer_enabled)) return; + if (__get_cpu_var(tracing_cpu)) + return; + cpu = raw_smp_processor_id(); data = tr->data[cpu]; if (unlikely(!data) || unlikely(!data->trace) || - data->critical_start || atomic_read(&data->disabled)) + atomic_read(&data->disabled)) return; atomic_inc(&data->disabled); data->critical_sequence = max_sequence; data->preempt_timestamp = now(cpu); - data->critical_start = parent_ip; + data->critical_start = parent_ip ? 
: ip; tracing_reset(data); local_save_flags(flags); + ftrace(tr, data, ip, parent_ip, flags); + __get_cpu_var(tracing_cpu) = 1; + atomic_dec(&data->disabled); } @@ -199,7 +232,13 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; - if (likely(!tracer_enabled)) + /* Always clear the tracing cpu on stopping the trace */ + if (unlikely(__get_cpu_var(tracing_cpu))) + __get_cpu_var(tracing_cpu) = 0; + else + return; + + if (!tracer_enabled) return; cpu = raw_smp_processor_id(); @@ -212,49 +251,35 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); local_save_flags(flags); ftrace(tr, data, ip, parent_ip, flags); - check_critical_timing(tr, data, parent_ip, cpu); + check_critical_timing(tr, data, parent_ip ? : ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); } +/* start and stop critical timings used to for stoppage (in idle) */ void notrace start_critical_timings(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } void notrace stop_critical_timings(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } +#ifdef CONFIG_IRQSOFF_TRACER #ifdef CONFIG_PROVE_LOCKING void notrace time_hardirqs_on(unsigned long a0, unsigned long a1) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) stop_critical_timing(a0, a1); } void notrace time_hardirqs_off(unsigned long a0, unsigned long a1) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) start_critical_timing(a0, a1); } @@ -289,49 +314,46 @@ inline void print_irqtrace_events(struct task_struct *curr) */ void notrace trace_hardirqs_on(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL(trace_hardirqs_on); void notrace trace_hardirqs_off(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL(trace_hardirqs_off); void notrace trace_hardirqs_on_caller(unsigned long caller_addr) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); void notrace trace_hardirqs_off_caller(unsigned long caller_addr) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); } EXPORT_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_PROVE_LOCKING */ +#endif /* CONFIG_IRQSOFF_TRACER */ + +#ifdef CONFIG_PREEMPT_TRACER +void notrace trace_preempt_on(unsigned long a0, unsigned long a1) +{ + stop_critical_timing(a0, a1); +} + +void notrace trace_preempt_off(unsigned long a0, unsigned long a1) +{ + start_critical_timing(a0, a1); +} +#endif /* CONFIG_PREEMPT_TRACER */ static void start_irqsoff_tracer(struct trace_array *tr) { @@ -345,7 +367,7 @@ static void 
stop_irqsoff_tracer(struct trace_array *tr) tracer_enabled = 0; } -static void irqsoff_tracer_init(struct trace_array *tr) +static void __irqsoff_tracer_init(struct trace_array *tr) { irqsoff_trace = tr; /* make sure that the tracer is visibel */ @@ -382,6 +404,13 @@ static void notrace irqsoff_tracer_close(struct trace_iterator *iter) start_irqsoff_tracer(iter->tr); } +#ifdef CONFIG_IRQSOFF_TRACER +static void irqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF; + + __irqsoff_tracer_init(tr); +} static struct tracer irqsoff_tracer __read_mostly = { .name = "irqsoff", @@ -392,10 +421,65 @@ static struct tracer irqsoff_tracer __read_mostly = .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, }; +# define register_irqsoff(trace) register_tracer(&trace) +#else +# define register_irqsoff(trace) do { } while (0) +#endif + +#ifdef CONFIG_PREEMPT_TRACER +static void preemptoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_PREEMPT_OFF; + + __irqsoff_tracer_init(tr); +} + +static struct tracer preemptoff_tracer __read_mostly = +{ + .name = "preemptoff", + .init = preemptoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +}; +# define register_preemptoff(trace) register_tracer(&trace) +#else +# define register_preemptoff(trace) do { } while (0) +#endif + +#if defined(CONFIG_IRQSOFF_TRACER) && \ + defined(CONFIG_PREEMPT_TRACER) + +static void preemptirqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; + + __irqsoff_tracer_init(tr); +} + +static struct tracer preemptirqsoff_tracer __read_mostly = +{ + .name = "preemptirqsoff", + .init = preemptirqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +}; + +# define register_preemptirqsoff(trace) register_tracer(&trace) +#else +# define register_preemptirqsoff(trace) do { } while (0) +#endif __init static int init_irqsoff_tracer(void) { - register_tracer(&irqsoff_tracer); + register_irqsoff(irqsoff_tracer); + register_preemptoff(preemptoff_tracer); + register_preemptirqsoff(preemptirqsoff_tracer); return 0; } -- cgit v1.2.3 From 3d0833953e1b98b79ddf491dd49229eef9baeac1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: dynamic enabling/disabling of function calls This patch adds a feature to dynamically replace the ftrace code with the jmps to allow a kernel with ftrace configured to run as fast as it can without it configured. The way this works, is on bootup (if ftrace is enabled), a ftrace function is registered to record the instruction pointer of all places that call the function. Later, if there's still any code to patch, a kthread is awoken (rate limited to at most once a second) that performs a stop_machine, and replaces all the code that was called with a jmp over the call to ftrace. It only replaces what was found the previous time. Typically the system reaches equilibrium quickly after bootup and there's no code patching needed at all. e.g. call ftrace /* 5 bytes */ is replaced with jmp 3f /* jmp is 2 bytes and we jump 3 forward */ 3: When we want to enable ftrace for function tracing, the IP recording is removed, and stop_machine is called again to replace all the locations of that were recorded back to the call of ftrace. 
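To make the byte layout described above concrete, here is an illustrative user-space snippet (my sketch, not part of the patch) that prints the two 5-byte sequences, assuming the usual x86 encodings 0xe8 = call rel32 and 0xeb = jmp rel8:

#include <stdio.h>

int main(void)
{
	/* the original 5-byte call site; the rel32 offset is filled in at link time */
	unsigned char call_site[5] = { 0xe8, 0x00, 0x00, 0x00, 0x00 };
	/* patched form: a 2-byte short jmp that hops over the 3 leftover bytes */
	unsigned char patched[5]   = { 0xeb, 0x03, 0x00, 0x00, 0x00 };
	int i;

	printf("call ftrace: ");
	for (i = 0; i < 5; i++)
		printf("%02x ", call_site[i]);
	printf("\njmp 3f     : ");
	for (i = 0; i < 5; i++)
		printf("%02x ", patched[i]);
	printf("\n");
	return 0;
}
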
When it is disabled, we replace the code back to the jmp. Allocation is done by the kthread. If the ftrace recording function is called, and we don't have any record slots available, then we simply skip that call. Once a second a new page (if needed) is allocated for recording new ftrace function calls. A large batch is allocated at boot up to get most of the calls there. Because we do this via stop_machine, we don't have to worry about another CPU executing a ftrace call as we modify it. But we do need to worry about NMI's so all functions that might be called via nmi must be annotated with notrace_nmi. When this code is configured in, the NMI code will not call notrace. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/Kconfig | 17 +++ kernel/trace/ftrace.c | 356 +++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 341 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6430016b98e..cad9db1dee0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -88,3 +88,20 @@ config CONTEXT_SWITCH_TRACER This tracer gets called from the context switch and records all switching of tasks. +config DYNAMIC_FTRACE + bool "enable/disable ftrace tracepoints dynamically" + depends on FTRACE + default y + help + This option will modify all the calls to ftrace dynamically + (will patch them out of the binary image and replaces them + with a No-Op instruction) as they are called. A table is + created to dynamically enable them again. + + This way a CONFIG_FTRACE kernel is slightly larger, but otherwise + has native performance as long as no tracing is active. + + The changes to the code are done by a kernel thread that + wakes up once a second and checks to see if any ftrace calls + were made. If so, it runs stop_machine (stops all CPUS) + and modifies the code to jump over the call to ftrace. diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b6a80b98a3f..d1ae2ba2527 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -13,10 +13,19 @@ * Copyright (C) 2004 William Lee Irwin III */ -#include +#include +#include +#include +#include +#include #include +#include +#include +#include + +#include "trace.h" -static DEFINE_SPINLOCK(ftrace_func_lock); +static DEFINE_SPINLOCK(ftrace_lock); static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, @@ -44,21 +53,21 @@ notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip) } /** - * register_ftrace_function - register a function for profiling - * @ops - ops structure that holds the function for profiling. - * - * Register a function to be called by all functions in the - * kernel. + * clear_ftrace_function - reset the ftrace function * - * Note: @ops->func and all the functions it calls must be labeled - * with "notrace", otherwise it will go into a - * recursive loop. + * This NULLs the ftrace function and in essence stops + * tracing. 
There may be lag */ -int register_ftrace_function(struct ftrace_ops *ops) +void clear_ftrace_function(void) { - unsigned long flags; + ftrace_trace_function = ftrace_stub; +} + +static int notrace __register_ftrace_function(struct ftrace_ops *ops) +{ + /* Should never be called by interrupts */ + spin_lock(&ftrace_lock); - spin_lock_irqsave(&ftrace_func_lock, flags); ops->next = ftrace_list; /* * We are entering ops into the ftrace_list but another @@ -68,6 +77,7 @@ int register_ftrace_function(struct ftrace_ops *ops) */ smp_wmb(); ftrace_list = ops; + /* * For one func, simply call it directly. * For more than one func, call the chain. @@ -76,28 +86,22 @@ int register_ftrace_function(struct ftrace_ops *ops) ftrace_trace_function = ops->func; else ftrace_trace_function = ftrace_list_func; - spin_unlock_irqrestore(&ftrace_func_lock, flags); + + spin_unlock(&ftrace_lock); return 0; } -/** - * unregister_ftrace_function - unresgister a function for profiling. - * @ops - ops structure that holds the function to unregister - * - * Unregister a function that was added to be called by ftrace profiling. - */ -int unregister_ftrace_function(struct ftrace_ops *ops) +static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) { - unsigned long flags; struct ftrace_ops **p; int ret = 0; - spin_lock_irqsave(&ftrace_func_lock, flags); + spin_lock(&ftrace_lock); /* - * If we are the only function, then the ftrace pointer is - * pointing directly to that function. + * If we are removing the last function, then simply point + * to the ftrace_stub. */ if (ftrace_list == ops && ops->next == &ftrace_list_end) { ftrace_trace_function = ftrace_stub; @@ -117,22 +121,310 @@ int unregister_ftrace_function(struct ftrace_ops *ops) *p = (*p)->next; /* If we only have one func left, then call that directly */ - if (ftrace_list->next == &ftrace_list_end) + if (ftrace_list == &ftrace_list_end || + ftrace_list->next == &ftrace_list_end) ftrace_trace_function = ftrace_list->func; out: - spin_unlock_irqrestore(&ftrace_func_lock, flags); + spin_unlock(&ftrace_lock); + + return ret; +} + +#ifdef CONFIG_DYNAMIC_FTRACE + +static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; + +static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); + +static DEFINE_SPINLOCK(ftrace_shutdown_lock); +static DEFINE_MUTEX(ftraced_lock); + +static int ftraced_trigger; +static int ftraced_suspend; + +static int ftrace_record_suspend; + +static inline int +notrace ftrace_ip_in_hash(unsigned long ip, unsigned long key) +{ + struct dyn_ftrace *p; + struct hlist_node *t; + int found = 0; + + hlist_for_each_entry(p, t, &ftrace_hash[key], node) { + if (p->ip == ip) { + found = 1; + break; + } + } + + return found; +} + +static inline void notrace +ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) +{ + hlist_add_head(&node->node, &ftrace_hash[key]); +} + +static void notrace +ftrace_record_ip(unsigned long ip, unsigned long parent_ip) +{ + struct dyn_ftrace *node; + unsigned long flags; + unsigned long key; + int resched; + int atomic; + + resched = need_resched(); + preempt_disable_notrace(); + + /* We simply need to protect against recursion */ + __get_cpu_var(ftrace_shutdown_disable_cpu)++; + if (__get_cpu_var(ftrace_shutdown_disable_cpu) != 1) + goto out; + + if (unlikely(ftrace_record_suspend)) + goto out; + + key = hash_long(ip, FTRACE_HASHBITS); + + WARN_ON_ONCE(key >= FTRACE_HASHSIZE); + + if (ftrace_ip_in_hash(ip, key)) + goto out; + + atomic = irqs_disabled(); + + spin_lock_irqsave(&ftrace_shutdown_lock, flags); + + /* 
This ip may have hit the hash before the lock */ + if (ftrace_ip_in_hash(ip, key)) + goto out_unlock; + + /* + * There's a slight race that the ftraced will update the + * hash and reset here. The arch alloc is responsible + * for seeing if the IP has already changed, and if + * it has, the alloc will fail. + */ + node = ftrace_alloc_shutdown_node(ip); + if (!node) + goto out_unlock; + + node->ip = ip; + + ftrace_add_hash(node, key); + + ftraced_trigger = 1; + + out_unlock: + spin_unlock_irqrestore(&ftrace_shutdown_lock, flags); + out: + __get_cpu_var(ftrace_shutdown_disable_cpu)--; + + /* prevent recursion with scheduler */ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +static struct ftrace_ops ftrace_shutdown_ops __read_mostly = +{ + .func = ftrace_record_ip, +}; + + +static int notrace __ftrace_modify_code(void *data) +{ + void (*func)(void) = data; + + func(); + return 0; +} + +static void notrace ftrace_run_startup_code(void) +{ + stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS); +} + +static void notrace ftrace_run_shutdown_code(void) +{ + stop_machine_run(__ftrace_modify_code, ftrace_shutdown_code, NR_CPUS); +} + +static void notrace ftrace_startup(void) +{ + mutex_lock(&ftraced_lock); + ftraced_suspend++; + if (ftraced_suspend != 1) + goto out; + __unregister_ftrace_function(&ftrace_shutdown_ops); + + ftrace_run_startup_code(); + out: + mutex_unlock(&ftraced_lock); +} + +static void notrace ftrace_shutdown(void) +{ + mutex_lock(&ftraced_lock); + ftraced_suspend--; + if (ftraced_suspend) + goto out; + + ftrace_run_shutdown_code(); + + __register_ftrace_function(&ftrace_shutdown_ops); + out: + mutex_unlock(&ftraced_lock); +} + +static cycle_t ftrace_update_time; +static unsigned long ftrace_update_cnt; +unsigned long ftrace_update_tot_cnt; + +static int notrace __ftrace_update_code(void *ignore) +{ + struct dyn_ftrace *p; + struct hlist_head head; + struct hlist_node *t; + cycle_t start, stop; + int i; + + /* Don't be calling ftrace ops now */ + __unregister_ftrace_function(&ftrace_shutdown_ops); + + start = now(raw_smp_processor_id()); + ftrace_update_cnt = 0; + + /* No locks needed, the machine is stopped! */ + for (i = 0; i < FTRACE_HASHSIZE; i++) { + if (hlist_empty(&ftrace_hash[i])) + continue; + + head = ftrace_hash[i]; + INIT_HLIST_HEAD(&ftrace_hash[i]); + + /* all CPUS are stopped, we are safe to modify code */ + hlist_for_each_entry(p, t, &head, node) { + ftrace_code_disable(p); + ftrace_update_cnt++; + } + + } + + stop = now(raw_smp_processor_id()); + ftrace_update_time = stop - start; + ftrace_update_tot_cnt += ftrace_update_cnt; + + __register_ftrace_function(&ftrace_shutdown_ops); return 0; } +static void notrace ftrace_update_code(void) +{ + stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); +} + +static int notrace ftraced(void *ignore) +{ + unsigned long usecs; + + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + + /* check once a second */ + schedule_timeout(HZ); + + mutex_lock(&ftraced_lock); + if (ftraced_trigger && !ftraced_suspend) { + ftrace_record_suspend++; + ftrace_update_code(); + usecs = nsecs_to_usecs(ftrace_update_time); + if (ftrace_update_tot_cnt > 100000) { + ftrace_update_tot_cnt = 0; + pr_info("hm, dftrace overflow: %lu change%s" + " (%lu total) in %lu usec%s\n", + ftrace_update_cnt, + ftrace_update_cnt != 1 ? "s" : "", + ftrace_update_tot_cnt, + usecs, usecs != 1 ? 
"s" : ""); + WARN_ON_ONCE(1); + } + ftraced_trigger = 0; + ftrace_record_suspend--; + } + mutex_unlock(&ftraced_lock); + + ftrace_shutdown_replenish(); + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static int __init notrace ftrace_shutdown_init(void) +{ + struct task_struct *p; + int ret; + + ret = ftrace_shutdown_arch_init(); + if (ret) + return ret; + + p = kthread_run(ftraced, NULL, "ftraced"); + if (IS_ERR(p)) + return -1; + + __register_ftrace_function(&ftrace_shutdown_ops); + + return 0; +} + +core_initcall(ftrace_shutdown_init); +#else +# define ftrace_startup() do { } while (0) +# define ftrace_shutdown() do { } while (0) +#endif /* CONFIG_DYNAMIC_FTRACE */ + /** - * clear_ftrace_function - reset the ftrace function + * register_ftrace_function - register a function for profiling + * @ops - ops structure that holds the function for profiling. * - * This NULLs the ftrace function and in essence stops - * tracing. There may be lag + * Register a function to be called by all functions in the + * kernel. + * + * Note: @ops->func and all the functions it calls must be labeled + * with "notrace", otherwise it will go into a + * recursive loop. */ -void clear_ftrace_function(void) +int register_ftrace_function(struct ftrace_ops *ops) { - ftrace_trace_function = ftrace_stub; + ftrace_startup(); + + return __register_ftrace_function(ops); +} + +/** + * unregister_ftrace_function - unresgister a function for profiling. + * @ops - ops structure that holds the function to unregister + * + * Unregister a function that was added to be called by ftrace profiling. + */ +int unregister_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + ret = __unregister_ftrace_function(ops); + + if (ftrace_list == &ftrace_list_end) + ftrace_shutdown(); + + return ret; } -- cgit v1.2.3 From b0fc494fae96a7089f3651cb451f461c7291244c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: add ftrace_enabled sysctl to disable mcount function This patch adds back the sysctl ftrace_enabled. This time it is defaulted to on, if DYNAMIC_FTRACE is configured. When ftrace_enabled is disabled, the ftrace function is set to the stub return. If DYNAMIC_FTRACE is also configured, on ftrace_enabled = 0, the registered ftrace functions will all be set to jmps, but no more new calls to ftrace recording (used to find the ftrace calling sites) will be called. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sysctl.c | 11 +++++ kernel/trace/ftrace.c | 125 ++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 118 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 29116652dca..efaf7c5500e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -455,6 +456,16 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_FTRACE + { + .ctl_name = CTL_UNNUMBERED, + .procname = "ftrace_enabled", + .data = &ftrace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ftrace_enable_sysctl, + }, +#endif #ifdef CONFIG_KMOD { .ctl_name = KERN_MODPROBE, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d1ae2ba2527..d3de37299ba 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -20,12 +20,24 @@ #include #include #include +#include #include #include #include "trace.h" +#ifdef CONFIG_DYNAMIC_FTRACE +# define FTRACE_ENABLED_INIT 1 +#else +# define FTRACE_ENABLED_INIT 0 +#endif + +int ftrace_enabled = FTRACE_ENABLED_INIT; +static int last_ftrace_enabled = FTRACE_ENABLED_INIT; + static DEFINE_SPINLOCK(ftrace_lock); +static DEFINE_MUTEX(ftrace_sysctl_lock); + static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, @@ -78,14 +90,16 @@ static int notrace __register_ftrace_function(struct ftrace_ops *ops) smp_wmb(); ftrace_list = ops; - /* - * For one func, simply call it directly. - * For more than one func, call the chain. - */ - if (ops->next == &ftrace_list_end) - ftrace_trace_function = ops->func; - else - ftrace_trace_function = ftrace_list_func; + if (ftrace_enabled) { + /* + * For one func, simply call it directly. + * For more than one func, call the chain. 
+ */ + if (ops->next == &ftrace_list_end) + ftrace_trace_function = ops->func; + else + ftrace_trace_function = ftrace_list_func; + } spin_unlock(&ftrace_lock); @@ -120,10 +134,12 @@ static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) *p = (*p)->next; - /* If we only have one func left, then call that directly */ - if (ftrace_list == &ftrace_list_end || - ftrace_list->next == &ftrace_list_end) - ftrace_trace_function = ftrace_list->func; + if (ftrace_enabled) { + /* If we only have one func left, then call that directly */ + if (ftrace_list == &ftrace_list_end || + ftrace_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_list->func; + } out: spin_unlock(&ftrace_lock); @@ -263,7 +279,8 @@ static void notrace ftrace_startup(void) goto out; __unregister_ftrace_function(&ftrace_shutdown_ops); - ftrace_run_startup_code(); + if (ftrace_enabled) + ftrace_run_startup_code(); out: mutex_unlock(&ftraced_lock); } @@ -275,13 +292,32 @@ static void notrace ftrace_shutdown(void) if (ftraced_suspend) goto out; - ftrace_run_shutdown_code(); + if (ftrace_enabled) + ftrace_run_shutdown_code(); __register_ftrace_function(&ftrace_shutdown_ops); out: mutex_unlock(&ftraced_lock); } +static void notrace ftrace_startup_sysctl(void) +{ + mutex_lock(&ftraced_lock); + /* ftraced_suspend is true if we want ftrace running */ + if (ftraced_suspend) + ftrace_run_startup_code(); + mutex_unlock(&ftraced_lock); +} + +static void notrace ftrace_shutdown_sysctl(void) +{ + mutex_lock(&ftraced_lock); + /* ftraced_suspend is true if ftrace is running */ + if (ftraced_suspend) + ftrace_run_shutdown_code(); + mutex_unlock(&ftraced_lock); +} + static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; @@ -341,8 +377,9 @@ static int notrace ftraced(void *ignore) /* check once a second */ schedule_timeout(HZ); + mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftraced_lock); - if (ftraced_trigger && !ftraced_suspend) { + if (ftrace_enabled && ftraced_trigger && !ftraced_suspend) { ftrace_record_suspend++; ftrace_update_code(); usecs = nsecs_to_usecs(ftrace_update_time); @@ -360,6 +397,7 @@ static int notrace ftraced(void *ignore) ftrace_record_suspend--; } mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); ftrace_shutdown_replenish(); @@ -389,8 +427,10 @@ static int __init notrace ftrace_shutdown_init(void) core_initcall(ftrace_shutdown_init); #else -# define ftrace_startup() do { } while (0) -# define ftrace_shutdown() do { } while (0) +# define ftrace_startup() do { } while (0) +# define ftrace_shutdown() do { } while (0) +# define ftrace_startup_sysctl() do { } while (0) +# define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ /** @@ -406,9 +446,15 @@ core_initcall(ftrace_shutdown_init); */ int register_ftrace_function(struct ftrace_ops *ops) { + int ret; + + mutex_lock(&ftrace_sysctl_lock); ftrace_startup(); - return __register_ftrace_function(ops); + ret = __register_ftrace_function(ops); + mutex_unlock(&ftrace_sysctl_lock); + + return ret; } /** @@ -421,10 +467,53 @@ int unregister_ftrace_function(struct ftrace_ops *ops) { int ret; + mutex_lock(&ftrace_sysctl_lock); ret = __unregister_ftrace_function(ops); if (ftrace_list == &ftrace_list_end) ftrace_shutdown(); + mutex_unlock(&ftrace_sysctl_lock); + + return ret; +} + +notrace int +ftrace_enable_sysctl(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + 
mutex_lock(&ftrace_sysctl_lock); + + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + + if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) + goto out; + + last_ftrace_enabled = ftrace_enabled; + + if (ftrace_enabled) { + + ftrace_startup_sysctl(); + + /* we are starting ftrace again */ + if (ftrace_list != &ftrace_list_end) { + if (ftrace_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_list->func; + else + ftrace_trace_function = ftrace_list_func; + } + + } else { + /* stopping ftrace calls (just send to ftrace_stub) */ + ftrace_trace_function = ftrace_stub; + + ftrace_shutdown_sysctl(); + } + + out: + mutex_unlock(&ftrace_sysctl_lock); return ret; } -- cgit v1.2.3 From 3c1720f00bb619302ba19d55986ab565e74d06db Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: move memory management out of arch code This patch moves the memory management of the ftrace records out of the arch code and into the generic code making the arch code simpler. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 154 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 152 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d3de37299ba..f6d9af3bf66 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -156,6 +156,21 @@ static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); static DEFINE_SPINLOCK(ftrace_shutdown_lock); static DEFINE_MUTEX(ftraced_lock); +struct ftrace_page { + struct ftrace_page *next; + int index; + struct dyn_ftrace records[]; +} __attribute__((packed)); + +#define ENTRIES_PER_PAGE \ + ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) + +/* estimate from running different kernels */ +#define NR_TO_INIT 10000 + +static struct ftrace_page *ftrace_pages_start; +static struct ftrace_page *ftrace_pages; + static int ftraced_trigger; static int ftraced_suspend; @@ -184,6 +199,21 @@ ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) hlist_add_head(&node->node, &ftrace_hash[key]); } +static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +{ + /* If this was already converted, skip it */ + if (ftrace_ip_converted(ip)) + return NULL; + + if (ftrace_pages->index == ENTRIES_PER_PAGE) { + if (!ftrace_pages->next) + return NULL; + ftrace_pages = ftrace_pages->next; + } + + return &ftrace_pages->records[ftrace_pages->index++]; +} + static void notrace ftrace_record_ip(unsigned long ip, unsigned long parent_ip) { @@ -252,6 +282,62 @@ static struct ftrace_ops ftrace_shutdown_ops __read_mostly = .func = ftrace_record_ip, }; +#define MCOUNT_ADDR ((long)(&mcount)) + +static void notrace ftrace_replace_code(int saved) +{ + unsigned char *new = NULL, *old = NULL; + struct dyn_ftrace *rec; + struct ftrace_page *pg; + unsigned long ip; + int failed; + int i; + + if (saved) + old = ftrace_nop_replace(); + else + new = ftrace_nop_replace(); + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + + /* don't modify code that has already faulted */ + if (rec->flags & FTRACE_FL_FAILED) + continue; + + ip = rec->ip; + + if (saved) + new = ftrace_call_replace(ip, MCOUNT_ADDR); + else + old = ftrace_call_replace(ip, MCOUNT_ADDR); + + failed = ftrace_modify_code(ip, old, new); + if (failed) + rec->flags |= FTRACE_FL_FAILED; + } + } +} + +static notrace void 
ftrace_startup_code(void) +{ + ftrace_replace_code(1); +} + +static notrace void ftrace_shutdown_code(void) +{ + ftrace_replace_code(0); +} + +static notrace void ftrace_shutdown_replenish(void) +{ + if (ftrace_pages->next) + return; + + /* allocate another page */ + ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); +} static int notrace __ftrace_modify_code(void *data) { @@ -261,6 +347,23 @@ static int notrace __ftrace_modify_code(void *data) return 0; } +static notrace void +ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip; + unsigned char *nop, *call; + int failed; + + ip = rec->ip; + + nop = ftrace_nop_replace(); + call = ftrace_call_replace(ip, addr); + + failed = ftrace_modify_code(ip, call, nop); + if (failed) + rec->flags |= FTRACE_FL_FAILED; +} + static void notrace ftrace_run_startup_code(void) { stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS); @@ -346,7 +449,7 @@ static int notrace __ftrace_update_code(void *ignore) /* all CPUS are stopped, we are safe to modify code */ hlist_for_each_entry(p, t, &head, node) { - ftrace_code_disable(p); + ftrace_code_disable(p, MCOUNT_ADDR); ftrace_update_cnt++; } @@ -407,12 +510,59 @@ static int notrace ftraced(void *ignore) return 0; } +static int __init ftrace_dyn_table_alloc(void) +{ + struct ftrace_page *pg; + int cnt; + int i; + int ret; + + ret = ftrace_dyn_arch_init(); + if (ret) + return ret; + + /* allocate a few pages */ + ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages_start) + return -1; + + /* + * Allocate a few more pages. + * + * TODO: have some parser search vmlinux before + * final linking to find all calls to ftrace. + * Then we can: + * a) know how many pages to allocate. + * and/or + * b) set up the table then. + * + * The dynamic code is still necessary for + * modules. + */ + + pg = ftrace_pages = ftrace_pages_start; + + cnt = NR_TO_INIT / ENTRIES_PER_PAGE; + + for (i = 0; i < cnt; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + + /* If we fail, we'll try later anyway */ + if (!pg->next) + break; + + pg = pg->next; + } + + return 0; +} + static int __init notrace ftrace_shutdown_init(void) { struct task_struct *p; int ret; - ret = ftrace_shutdown_arch_init(); + ret = ftrace_dyn_table_alloc(); if (ret) return ret; -- cgit v1.2.3 From d61f82d06672f57fca410da6f7fffd15867db622 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: use dynamic patching for updating mcount calls This patch replaces the indirect call to the mcount function pointer with a direct call that will be patched by the dynamic ftrace routines. On boot up, the mcount function calls the ftace_stub function. When the dynamic ftrace code is initialized, the ftrace_stub is replaced with a call to the ftrace_record_ip, which records the instruction pointers of the locations that call it. Later, the ftraced daemon will call kstop_machine and patch all the locations to nops. When a ftrace is enabled, the original calls to mcount will now be set top call ftrace_caller, which will do a direct call to the registered ftrace function. This direct call is also patched when the function that should be called is updated. All patching is performed by a kstop_machine routine to prevent any type of race conditions that is associated with modifying code on the fly. 
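A condensed user-space model of the command bitmask that the patch dispatches on inside stop_machine (my sketch; the real patching helpers ftrace_replace_code() and ftrace_update_ftrace_func() are replaced by printf stubs, and the ENABLE/DISABLE_MCOUNT flags are omitted for brevity):

#include <stdio.h>

enum {
	FTRACE_ENABLE_CALLS      = (1 << 0),
	FTRACE_DISABLE_CALLS     = (1 << 1),
	FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
};

static void ftrace_replace_code(int enable)
{
	printf("patch call sites to %s\n", enable ? "calls" : "nops");
}

static void ftrace_update_ftrace_func(void)
{
	printf("patch the ftrace call target\n");
}

/* in the kernel this runs while all CPUs are stopped */
static int modify_code(int command)
{
	if (command & FTRACE_ENABLE_CALLS)
		ftrace_replace_code(1);
	else if (command & FTRACE_DISABLE_CALLS)
		ftrace_replace_code(0);

	if (command & FTRACE_UPDATE_TRACE_FUNC)
		ftrace_update_ftrace_func();

	return 0;
}

int main(void)
{
	/* first tracer registered: enable all call sites and install its handler */
	modify_code(FTRACE_ENABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC);
	/* last tracer removed: turn the calls back into nops */
	modify_code(FTRACE_DISABLE_CALLS);
	return 0;
}

In the patch itself the same bitmask also carries FTRACE_ENABLE_MCOUNT and FTRACE_DISABLE_MCOUNT so the recording hook can be switched at the same stop_machine point.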
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 183 +++++++++++++++++++++++++++++--------------------- 1 file changed, 105 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f6d9af3bf66..88544f9bc0e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -26,14 +26,8 @@ #include "trace.h" -#ifdef CONFIG_DYNAMIC_FTRACE -# define FTRACE_ENABLED_INIT 1 -#else -# define FTRACE_ENABLED_INIT 0 -#endif - -int ftrace_enabled = FTRACE_ENABLED_INIT; -static int last_ftrace_enabled = FTRACE_ENABLED_INIT; +int ftrace_enabled; +static int last_ftrace_enabled; static DEFINE_SPINLOCK(ftrace_lock); static DEFINE_MUTEX(ftrace_sysctl_lock); @@ -149,6 +143,14 @@ static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE +enum { + FTRACE_ENABLE_CALLS = (1 << 0), + FTRACE_DISABLE_CALLS = (1 << 1), + FTRACE_UPDATE_TRACE_FUNC = (1 << 2), + FTRACE_ENABLE_MCOUNT = (1 << 3), + FTRACE_DISABLE_MCOUNT = (1 << 4), +}; + static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); @@ -199,12 +201,8 @@ ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) hlist_add_head(&node->node, &ftrace_hash[key]); } -static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { - /* If this was already converted, skip it */ - if (ftrace_ip_converted(ip)) - return NULL; - if (ftrace_pages->index == ENTRIES_PER_PAGE) { if (!ftrace_pages->next) return NULL; @@ -215,7 +213,7 @@ static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) } static void notrace -ftrace_record_ip(unsigned long ip, unsigned long parent_ip) +ftrace_record_ip(unsigned long ip) { struct dyn_ftrace *node; unsigned long flags; @@ -223,6 +221,9 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip) int resched; int atomic; + if (!ftrace_enabled) + return; + resched = need_resched(); preempt_disable_notrace(); @@ -251,11 +252,12 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip) /* * There's a slight race that the ftraced will update the - * hash and reset here. The arch alloc is responsible - * for seeing if the IP has already changed, and if - * it has, the alloc will fail. + * hash and reset here. If it is already converted, skip it. 
*/ - node = ftrace_alloc_shutdown_node(ip); + if (ftrace_ip_converted(ip)) + goto out_unlock; + + node = ftrace_alloc_dyn_node(ip); if (!node) goto out_unlock; @@ -277,11 +279,7 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip) preempt_enable_notrace(); } -static struct ftrace_ops ftrace_shutdown_ops __read_mostly = -{ - .func = ftrace_record_ip, -}; - +#define FTRACE_ADDR ((long)(&ftrace_caller)) #define MCOUNT_ADDR ((long)(&mcount)) static void notrace ftrace_replace_code(int saved) @@ -309,9 +307,9 @@ static void notrace ftrace_replace_code(int saved) ip = rec->ip; if (saved) - new = ftrace_call_replace(ip, MCOUNT_ADDR); + new = ftrace_call_replace(ip, FTRACE_ADDR); else - old = ftrace_call_replace(ip, MCOUNT_ADDR); + old = ftrace_call_replace(ip, FTRACE_ADDR); failed = ftrace_modify_code(ip, old, new); if (failed) @@ -320,16 +318,6 @@ static void notrace ftrace_replace_code(int saved) } } -static notrace void ftrace_startup_code(void) -{ - ftrace_replace_code(1); -} - -static notrace void ftrace_shutdown_code(void) -{ - ftrace_replace_code(0); -} - static notrace void ftrace_shutdown_replenish(void) { if (ftrace_pages->next) @@ -339,16 +327,8 @@ static notrace void ftrace_shutdown_replenish(void) ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); } -static int notrace __ftrace_modify_code(void *data) -{ - void (*func)(void) = data; - - func(); - return 0; -} - static notrace void -ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr) +ftrace_code_disable(struct dyn_ftrace *rec) { unsigned long ip; unsigned char *nop, *call; @@ -357,67 +337,113 @@ ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr) ip = rec->ip; nop = ftrace_nop_replace(); - call = ftrace_call_replace(ip, addr); + call = ftrace_call_replace(ip, MCOUNT_ADDR); failed = ftrace_modify_code(ip, call, nop); if (failed) rec->flags |= FTRACE_FL_FAILED; } -static void notrace ftrace_run_startup_code(void) +static int notrace __ftrace_modify_code(void *data) { - stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS); + unsigned long addr; + int *command = data; + + if (*command & FTRACE_ENABLE_CALLS) + ftrace_replace_code(1); + else if (*command & FTRACE_DISABLE_CALLS) + ftrace_replace_code(0); + + if (*command & FTRACE_UPDATE_TRACE_FUNC) + ftrace_update_ftrace_func(ftrace_trace_function); + + if (*command & FTRACE_ENABLE_MCOUNT) { + addr = (unsigned long)ftrace_record_ip; + ftrace_mcount_set(&addr); + } else if (*command & FTRACE_DISABLE_MCOUNT) { + addr = (unsigned long)ftrace_stub; + ftrace_mcount_set(&addr); + } + + return 0; } -static void notrace ftrace_run_shutdown_code(void) +static void notrace ftrace_run_update_code(int command) { - stop_machine_run(__ftrace_modify_code, ftrace_shutdown_code, NR_CPUS); + stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); } +static ftrace_func_t saved_ftrace_func; + static void notrace ftrace_startup(void) { + int command = 0; + mutex_lock(&ftraced_lock); ftraced_suspend++; - if (ftraced_suspend != 1) + if (ftraced_suspend == 1) + command |= FTRACE_ENABLE_CALLS; + + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) goto out; - __unregister_ftrace_function(&ftrace_shutdown_ops); - if (ftrace_enabled) - ftrace_run_startup_code(); + ftrace_run_update_code(command); out: mutex_unlock(&ftraced_lock); } static void notrace ftrace_shutdown(void) { + int command = 0; + mutex_lock(&ftraced_lock); 
ftraced_suspend--; - if (ftraced_suspend) - goto out; + if (!ftraced_suspend) + command |= FTRACE_DISABLE_CALLS; - if (ftrace_enabled) - ftrace_run_shutdown_code(); + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } - __register_ftrace_function(&ftrace_shutdown_ops); + if (!command || !ftrace_enabled) + goto out; + + ftrace_run_update_code(command); out: mutex_unlock(&ftraced_lock); } static void notrace ftrace_startup_sysctl(void) { + int command = FTRACE_ENABLE_MCOUNT; + mutex_lock(&ftraced_lock); + /* Force update next time */ + saved_ftrace_func = NULL; /* ftraced_suspend is true if we want ftrace running */ if (ftraced_suspend) - ftrace_run_startup_code(); + command |= FTRACE_ENABLE_CALLS; + + ftrace_run_update_code(command); mutex_unlock(&ftraced_lock); } static void notrace ftrace_shutdown_sysctl(void) { + int command = FTRACE_DISABLE_MCOUNT; + mutex_lock(&ftraced_lock); /* ftraced_suspend is true if ftrace is running */ if (ftraced_suspend) - ftrace_run_shutdown_code(); + command |= FTRACE_DISABLE_CALLS; + + ftrace_run_update_code(command); mutex_unlock(&ftraced_lock); } @@ -430,11 +456,13 @@ static int notrace __ftrace_update_code(void *ignore) struct dyn_ftrace *p; struct hlist_head head; struct hlist_node *t; + int save_ftrace_enabled; cycle_t start, stop; int i; - /* Don't be calling ftrace ops now */ - __unregister_ftrace_function(&ftrace_shutdown_ops); + /* Don't be recording funcs now */ + save_ftrace_enabled = ftrace_enabled; + ftrace_enabled = 0; start = now(raw_smp_processor_id()); ftrace_update_cnt = 0; @@ -449,7 +477,7 @@ static int notrace __ftrace_update_code(void *ignore) /* all CPUS are stopped, we are safe to modify code */ hlist_for_each_entry(p, t, &head, node) { - ftrace_code_disable(p, MCOUNT_ADDR); + ftrace_code_disable(p); ftrace_update_cnt++; } @@ -459,7 +487,7 @@ static int notrace __ftrace_update_code(void *ignore) ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; - __register_ftrace_function(&ftrace_shutdown_ops); + ftrace_enabled = save_ftrace_enabled; return 0; } @@ -515,11 +543,6 @@ static int __init ftrace_dyn_table_alloc(void) struct ftrace_page *pg; int cnt; int i; - int ret; - - ret = ftrace_dyn_arch_init(); - if (ret) - return ret; /* allocate a few pages */ ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); @@ -557,11 +580,19 @@ static int __init ftrace_dyn_table_alloc(void) return 0; } -static int __init notrace ftrace_shutdown_init(void) +static int __init notrace ftrace_dynamic_init(void) { struct task_struct *p; + unsigned long addr; int ret; + addr = (unsigned long)ftrace_record_ip; + stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); + + /* ftrace_dyn_arch_init places the return code in addr */ + if (addr) + return addr; + ret = ftrace_dyn_table_alloc(); if (ret) return ret; @@ -570,12 +601,12 @@ static int __init notrace ftrace_shutdown_init(void) if (IS_ERR(p)) return -1; - __register_ftrace_function(&ftrace_shutdown_ops); + last_ftrace_enabled = ftrace_enabled = 1; return 0; } -core_initcall(ftrace_shutdown_init); +core_initcall(ftrace_dynamic_init); #else # define ftrace_startup() do { } while (0) # define ftrace_shutdown() do { } while (0) @@ -599,9 +630,8 @@ int register_ftrace_function(struct ftrace_ops *ops) int ret; mutex_lock(&ftrace_sysctl_lock); - ftrace_startup(); - ret = __register_ftrace_function(ops); + ftrace_startup(); mutex_unlock(&ftrace_sysctl_lock); return ret; @@ -619,10 +649,7 @@ int 
unregister_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_sysctl_lock); ret = __unregister_ftrace_function(ops); - - if (ftrace_list == &ftrace_list_end) - ftrace_shutdown(); - + ftrace_shutdown(); mutex_unlock(&ftrace_sysctl_lock); return ret; -- cgit v1.2.3 From 5072c59fd45e9976d02ee6f18c7336ef97623cbc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: add filter select functions to trace This patch adds two files to the debugfs system: /debugfs/tracing/available_filter_functions and /debugfs/tracing/set_ftrace_filter The available_filter_functions lists all functions that has been recorded by the ftraced that has called the ftrace_record_ip function. This is to allow users to see what functions have been converted to nops and can be enabled for tracing. To enable functions, simply echo the names (whitespace delimited) into set_ftrace_filter. Simple wildcards are also allowed. echo 'scheduler' > /debugfs/tracing/set_ftrace_filter Will have only the scheduler be activated when tracing is enabled. echo 'sched_*' > /debugfs/tracing/set_ftrace_filter Will have only the functions starting with 'sched_' be activated. echo '*lock' > /debugfs/tracing/set_ftrace_filter Will have only functions ending with 'lock' be activated. echo '*lock*' > /debugfs/tracing/set_ftrace_filter Will have only functions with 'lock' in its name be activated. Note: 'sched*lock' will not work. The only wildcards that are allowed is an asterisk and the beginning and or end of the string passed in. Multiple names can be passed in with whitespace delimited: echo 'scheduler *lock *acpi*' > /debugfs/tracing/set_ftrace_filter is also the same as: echo 'scheduler' > /debugfs/tracing/set_ftrace_filter echo '*lock' >> /debugfs/tracing/set_ftrace_filter echo '*acpi*' >> /debugfs/tracing/set_ftrace_filter Appending does just that. It appends to the list. To disable all filters simply echo an empty line in: echo > /debugfs/tracing/set_ftrace_filter Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 527 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 510 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 88544f9bc0e..97d5cb7b7e7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -16,12 +16,15 @@ #include #include #include +#include +#include #include #include #include -#include +#include #include #include +#include #include #include "trace.h" @@ -151,12 +154,15 @@ enum { FTRACE_DISABLE_MCOUNT = (1 << 4), }; +static int ftrace_filtered; + static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); static DEFINE_SPINLOCK(ftrace_shutdown_lock); static DEFINE_MUTEX(ftraced_lock); +static DEFINE_MUTEX(ftrace_filter_lock); struct ftrace_page { struct ftrace_page *next; @@ -282,16 +288,82 @@ ftrace_record_ip(unsigned long ip) #define FTRACE_ADDR ((long)(&ftrace_caller)) #define MCOUNT_ADDR ((long)(&mcount)) -static void notrace ftrace_replace_code(int saved) +static void notrace +__ftrace_replace_code(struct dyn_ftrace *rec, + unsigned char *old, unsigned char *new, int enable) +{ + unsigned long ip; + int failed; + + ip = rec->ip; + + if (ftrace_filtered && enable) { + unsigned long fl; + /* + * If filtering is on: + * + * If this record is set to be filtered and + * is enabled then do nothing. 
+ * + * If this record is set to be filtered and + * it is not enabled, enable it. + * + * If this record is not set to be filtered + * and it is not enabled do nothing. + * + * If this record is not set to be filtered and + * it is enabled, disable it. + */ + fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); + + if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || + (fl == 0)) + return; + + /* + * If it is enabled disable it, + * otherwise enable it! + */ + if (fl == FTRACE_FL_ENABLED) { + /* swap new and old */ + new = old; + old = ftrace_call_replace(ip, FTRACE_ADDR); + rec->flags &= ~FTRACE_FL_ENABLED; + } else { + new = ftrace_call_replace(ip, FTRACE_ADDR); + rec->flags |= FTRACE_FL_ENABLED; + } + } else { + + if (enable) + new = ftrace_call_replace(ip, FTRACE_ADDR); + else + old = ftrace_call_replace(ip, FTRACE_ADDR); + + if (enable) { + if (rec->flags & FTRACE_FL_ENABLED) + return; + rec->flags |= FTRACE_FL_ENABLED; + } else { + if (!(rec->flags & FTRACE_FL_ENABLED)) + return; + rec->flags &= ~FTRACE_FL_ENABLED; + } + } + + failed = ftrace_modify_code(ip, old, new); + if (failed) + rec->flags |= FTRACE_FL_FAILED; +} + +static void notrace ftrace_replace_code(int enable) { unsigned char *new = NULL, *old = NULL; struct dyn_ftrace *rec; struct ftrace_page *pg; - unsigned long ip; - int failed; int i; - if (saved) + if (enable) old = ftrace_nop_replace(); else new = ftrace_nop_replace(); @@ -304,16 +376,7 @@ static void notrace ftrace_replace_code(int saved) if (rec->flags & FTRACE_FL_FAILED) continue; - ip = rec->ip; - - if (saved) - new = ftrace_call_replace(ip, FTRACE_ADDR); - else - old = ftrace_call_replace(ip, FTRACE_ADDR); - - failed = ftrace_modify_code(ip, old, new); - if (failed) - rec->flags |= FTRACE_FL_FAILED; + __ftrace_replace_code(rec, old, new, enable); } } } @@ -580,6 +643,436 @@ static int __init ftrace_dyn_table_alloc(void) return 0; } +enum { + FTRACE_ITER_FILTER = (1 << 0), + FTRACE_ITER_CONT = (1 << 1), +}; + +#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ + +struct ftrace_iterator { + loff_t pos; + struct ftrace_page *pg; + unsigned idx; + unsigned flags; + unsigned char buffer[FTRACE_BUFF_MAX+1]; + unsigned buffer_idx; + unsigned filtered; +}; + +static void notrace * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct dyn_ftrace *rec = NULL; + + (*pos)++; + + retry: + if (iter->idx >= iter->pg->index) { + if (iter->pg->next) { + iter->pg = iter->pg->next; + iter->idx = 0; + goto retry; + } + } else { + rec = &iter->pg->records[iter->idx++]; + if ((rec->flags & FTRACE_FL_FAILED) || + ((iter->flags & FTRACE_ITER_FILTER) && + !(rec->flags & FTRACE_FL_FILTER))) { + rec = NULL; + goto retry; + } + } + + iter->pos = *pos; + + return rec; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + void *p = NULL; + loff_t l = -1; + + if (*pos != iter->pos) { + for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l)) + ; + } else { + l = *pos; + p = t_next(m, p, &l); + } + + return p; +} + +static void t_stop(struct seq_file *m, void *p) +{ +} + +static int t_show(struct seq_file *m, void *v) +{ + struct dyn_ftrace *rec = v; + char str[KSYM_SYMBOL_LEN]; + + if (!rec) + return 0; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + + seq_printf(m, "%s\n", str); + + return 0; +} + +static struct seq_operations show_ftrace_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int 
notrace +ftrace_avail_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->pos = -1; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = iter; + } else + kfree(iter); + + return ret; +} + +int ftrace_avail_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct ftrace_iterator *iter = m->private; + + seq_release(inode, file); + kfree(iter); + return 0; +} + +static void notrace ftrace_filter_reset(void) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + unsigned i; + + /* keep kstop machine from running */ + preempt_disable(); + ftrace_filtered = 0; + pg = ftrace_pages_start; + while (pg) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + if (rec->flags & FTRACE_FL_FAILED) + continue; + rec->flags &= ~FTRACE_FL_FILTER; + } + pg = pg->next; + } + preempt_enable(); +} + +static int notrace +ftrace_filter_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret = 0; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + mutex_lock(&ftrace_filter_lock); + if ((file->f_mode & FMODE_WRITE) && + !(file->f_flags & O_APPEND)) + ftrace_filter_reset(); + + if (file->f_mode & FMODE_READ) { + iter->pg = ftrace_pages_start; + iter->pos = -1; + iter->flags = FTRACE_ITER_FILTER; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = iter; + } else + kfree(iter); + } else + file->private_data = iter; + mutex_unlock(&ftrace_filter_lock); + + return ret; +} + +static ssize_t notrace +ftrace_filter_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + if (file->f_mode & FMODE_READ) + return seq_read(file, ubuf, cnt, ppos); + else + return -EPERM; +} + +static loff_t notrace +ftrace_filter_lseek(struct file *file, loff_t offset, int origin) +{ + loff_t ret; + + if (file->f_mode & FMODE_READ) + ret = seq_lseek(file, offset, origin); + else + file->f_pos = ret = 1; + + return ret; +} + +enum { + MATCH_FULL, + MATCH_FRONT_ONLY, + MATCH_MIDDLE_ONLY, + MATCH_END_ONLY, +}; + +static void notrace +ftrace_match(unsigned char *buff, int len) +{ + char str[KSYM_SYMBOL_LEN]; + char *search = NULL; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int type = MATCH_FULL; + unsigned i, match = 0, search_len = 0; + + for (i = 0; i < len; i++) { + if (buff[i] == '*') { + if (!i) { + search = buff + i + 1; + type = MATCH_END_ONLY; + search_len = len - (i + 1); + } else { + if (type == MATCH_END_ONLY) { + type = MATCH_MIDDLE_ONLY; + } else { + match = i; + type = MATCH_FRONT_ONLY; + } + buff[i] = 0; + break; + } + } + } + + /* keep kstop machine from running */ + preempt_disable(); + ftrace_filtered = 1; + pg = ftrace_pages_start; + while (pg) { + for (i = 0; i < pg->index; i++) { + int matched = 0; + char *ptr; + + rec = &pg->records[i]; + if (rec->flags & FTRACE_FL_FAILED) + continue; + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + switch (type) { + case MATCH_FULL: + if (strcmp(str, buff) == 0) + matched = 1; + break; + case MATCH_FRONT_ONLY: + if (memcmp(str, buff, match) == 0) + matched = 1; + break; + case MATCH_MIDDLE_ONLY: + if (strstr(str, search)) + matched = 1; + break; + case MATCH_END_ONLY: + ptr = strstr(str, search); + if (ptr && (ptr[search_len] == 0)) 
+ matched = 1; + break; + } + if (matched) + rec->flags |= FTRACE_FL_FILTER; + } + pg = pg->next; + } + preempt_enable(); +} + +static ssize_t notrace +ftrace_filter_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct ftrace_iterator *iter; + char ch; + size_t read = 0; + ssize_t ret; + + if (!cnt || cnt < 0) + return 0; + + mutex_lock(&ftrace_filter_lock); + + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + iter = m->private; + } else + iter = file->private_data; + + if (!*ppos) { + iter->flags &= ~FTRACE_ITER_CONT; + iter->buffer_idx = 0; + } + + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + + if (!(iter->flags & ~FTRACE_ITER_CONT)) { + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + + if (isspace(ch)) { + file->f_pos += read; + ret = read; + goto out; + } + + iter->buffer_idx = 0; + } + + while (cnt && !isspace(ch)) { + if (iter->buffer_idx < FTRACE_BUFF_MAX) + iter->buffer[iter->buffer_idx++] = ch; + else { + ret = -EINVAL; + goto out; + } + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + if (isspace(ch)) { + iter->filtered++; + iter->buffer[iter->buffer_idx] = 0; + ftrace_match(iter->buffer, iter->buffer_idx); + iter->buffer_idx = 0; + } else + iter->flags |= FTRACE_ITER_CONT; + + + file->f_pos += read; + + ret = read; + out: + mutex_unlock(&ftrace_filter_lock); + + return ret; +} + +static int notrace +ftrace_filter_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct ftrace_iterator *iter; + + mutex_lock(&ftrace_filter_lock); + if (file->f_mode & FMODE_READ) { + iter = m->private; + + seq_release(inode, file); + } else + iter = file->private_data; + + if (iter->buffer_idx) { + iter->filtered++; + iter->buffer[iter->buffer_idx] = 0; + ftrace_match(iter->buffer, iter->buffer_idx); + } + + mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftraced_lock); + if (iter->filtered && ftraced_suspend && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); + + kfree(iter); + mutex_unlock(&ftrace_filter_lock); + return 0; +} + +static struct file_operations ftrace_avail_fops = { + .open = ftrace_avail_open, + .read = seq_read, + .llseek = seq_lseek, + .release = ftrace_avail_release, +}; + +static struct file_operations ftrace_filter_fops = { + .open = ftrace_filter_open, + .read = ftrace_filter_read, + .write = ftrace_filter_write, + .llseek = ftrace_filter_lseek, + .release = ftrace_filter_release, +}; + +static __init int ftrace_init_debugfs(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("available_filter_functions", 0444, + d_tracer, NULL, &ftrace_avail_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'available_filter_functions' entry\n"); + + entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, + NULL, &ftrace_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_ftrace_filter' entry\n"); + return 0; +} + +fs_initcall(ftrace_init_debugfs); + static int __init notrace ftrace_dynamic_init(void) { struct task_struct *p; @@ -657,14 +1150,14 @@ int unregister_ftrace_function(struct ftrace_ops *ops) notrace int ftrace_enable_sysctl(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t 
*lenp, + struct file *file, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; mutex_lock(&ftrace_sysctl_lock); - ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec(table, write, file, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) goto out; -- cgit v1.2.3 From 4c11d7aed389375253b59e2b1865eec96663c65d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: convert single large buffer into single pages. Allocating large buffers for the tracer may fail easily. This patch converts the buffer from a large ordered allocation to single pages. It uses the struct page LRU field to link the pages together. Later patches may also implement dynamic increasing and decreasing of the trace buffers. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 245 +++++++++++++++++++++++++++++++++++++++------------ kernel/trace/trace.h | 8 +- 2 files changed, 195 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1b8eca7650d..d7ad030a4c4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -49,7 +50,7 @@ static struct trace_array max_tr; static DEFINE_PER_CPU(struct trace_array_cpu, max_data); static int tracer_enabled; -static unsigned long trace_nr_entries = 4096UL; +static unsigned long trace_nr_entries = 16384UL; static struct tracer *trace_types __read_mostly; static struct tracer *current_trace __read_mostly; @@ -57,6 +58,8 @@ static int max_tracer_type_len; static DEFINE_MUTEX(trace_types_lock); +#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry)) + static int __init set_nr_entries(char *str) { if (!str) @@ -103,6 +106,7 @@ static const char *trace_options[] = { static unsigned trace_flags; +static DEFINE_SPINLOCK(ftrace_max_lock); /* * Copy the new maximum trace into the separate maximum-trace @@ -136,17 +140,23 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data; void *save_trace; + struct list_head save_pages; int i; + WARN_ON_ONCE(!irqs_disabled()); + spin_lock(&ftrace_max_lock); /* clear out all the previous traces */ for_each_possible_cpu(i) { data = tr->data[i]; save_trace = max_tr.data[i]->trace; + save_pages = max_tr.data[i]->trace_pages; memcpy(max_tr.data[i], data, sizeof(*data)); data->trace = save_trace; + data->trace_pages = save_pages; } __update_max_tr(tr, tsk, cpu); + spin_unlock(&ftrace_max_lock); } /** @@ -160,16 +170,22 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data = tr->data[cpu]; void *save_trace; + struct list_head save_pages; int i; + WARN_ON_ONCE(!irqs_disabled()); + spin_lock(&ftrace_max_lock); for_each_possible_cpu(i) tracing_reset(max_tr.data[i]); save_trace = max_tr.data[cpu]->trace; + save_pages = max_tr.data[cpu]->trace_pages; memcpy(max_tr.data[cpu], data, sizeof(*data)); data->trace = save_trace; + data->trace_pages = save_pages; __update_max_tr(tr, tsk, cpu); + spin_unlock(&ftrace_max_lock); } int register_tracer(struct tracer *type) @@ -236,7 +252,8 @@ void unregister_tracer(struct tracer *type) void notrace tracing_reset(struct trace_array_cpu *data) { data->trace_idx = 0; - atomic_set(&data->underrun, 0); + data->trace_current = data->trace; + data->trace_current_idx = 0; } #ifdef CONFIG_FTRACE @@ 
-367,21 +384,27 @@ tracing_get_trace_entry(struct trace_array *tr, { unsigned long idx, idx_next; struct trace_entry *entry; + struct page *page; + struct list_head *next; - idx = data->trace_idx; + data->trace_idx++; + idx = data->trace_current_idx; idx_next = idx + 1; - if (unlikely(idx_next >= tr->entries)) { - atomic_inc(&data->underrun); + entry = data->trace_current + idx * TRACE_ENTRY_SIZE; + + if (unlikely(idx_next >= ENTRIES_PER_PAGE)) { + page = virt_to_page(data->trace_current); + if (unlikely(&page->lru == data->trace_pages.prev)) + next = data->trace_pages.next; + else + next = page->lru.next; + page = list_entry(next, struct page, lru); + data->trace_current = page_address(page); idx_next = 0; } - data->trace_idx = idx_next; - - if (unlikely(idx_next != 0 && atomic_read(&data->underrun))) - atomic_inc(&data->underrun); - - entry = data->trace + idx * TRACE_ENTRY_SIZE; + data->trace_current_idx = idx_next; return entry; } @@ -442,21 +465,38 @@ enum trace_file_type { }; static struct trace_entry * -trace_entry_idx(struct trace_array *tr, unsigned long idx, int cpu) +trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, + struct trace_iterator *iter, int cpu) { - struct trace_entry *array = tr->data[cpu]->trace; - unsigned long underrun; + struct page *page; + struct trace_entry *array; - if (idx >= tr->entries) + if (iter->next_idx[cpu] >= tr->entries || + iter->next_idx[cpu] >= data->trace_idx) return NULL; - underrun = atomic_read(&tr->data[cpu]->underrun); - if (underrun) - idx = ((underrun - 1) + idx) % tr->entries; - else if (idx >= tr->data[cpu]->trace_idx) - return NULL; + if (!iter->next_page[cpu]) { + /* + * Initialize. If the count of elements in + * this buffer is greater than the max entries + * we had an underrun. Which means we looped around. + * We can simply use the current pointer as our + * starting point. 
+ */ + if (data->trace_idx >= tr->entries) { + page = virt_to_page(data->trace_current); + iter->next_page[cpu] = &page->lru; + iter->next_page_idx[cpu] = data->trace_current_idx; + } else { + iter->next_page[cpu] = data->trace_pages.next; + iter->next_page_idx[cpu] = 0; + } + } - return &array[idx]; + page = list_entry(iter->next_page[cpu], struct page, lru); + array = page_address(page); + + return &array[iter->next_page_idx[cpu]]; } static struct notrace trace_entry * @@ -470,7 +510,7 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu) for_each_possible_cpu(cpu) { if (!tr->data[cpu]->trace) continue; - ent = trace_entry_idx(tr, iter->next_idx[cpu], cpu); + ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); if (ent && (!next || (long)(next->idx - ent->idx) > 0)) { next = ent; @@ -492,8 +532,19 @@ static void *find_next_entry_inc(struct trace_iterator *iter) next = find_next_entry(iter, &next_cpu); if (next) { - iter->next_idx[next_cpu]++; iter->idx++; + iter->next_idx[next_cpu]++; + iter->next_page_idx[next_cpu]++; + if (iter->next_page_idx[next_cpu] >= ENTRIES_PER_PAGE) { + struct trace_array_cpu *data = iter->tr->data[next_cpu]; + + iter->next_page_idx[next_cpu] = 0; + iter->next_page[next_cpu] = + iter->next_page[next_cpu]->next; + if (iter->next_page[next_cpu] == &data->trace_pages) + iter->next_page[next_cpu] = + data->trace_pages.next; + } } iter->ent = next; iter->cpu = next_cpu; @@ -554,14 +605,16 @@ static void *s_start(struct seq_file *m, loff_t *pos) iter->cpu = 0; iter->idx = -1; - for (i = 0; i < NR_CPUS; i++) + for_each_possible_cpu(i) { iter->next_idx[i] = 0; + iter->next_page[i] = NULL; + } for (p = iter; p && l < *pos; p = s_next(m, p, &l)) ; } else { - l = *pos; + l = *pos - 1; p = s_next(m, p, &l); } @@ -654,9 +707,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) struct trace_array *tr = iter->tr; struct trace_array_cpu *data = tr->data[tr->cpu]; struct tracer *type = current_trace; - unsigned long underruns = 0; - unsigned long underrun; - unsigned long entries = 0; + unsigned long total = 0; + unsigned long entries = 0; int cpu; const char *name = "preemption"; @@ -665,11 +717,10 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) for_each_possible_cpu(cpu) { if (tr->data[cpu]->trace) { - underrun = atomic_read(&tr->data[cpu]->underrun); - if (underrun) { - underruns += underrun; + total += tr->data[cpu]->trace_idx; + if (tr->data[cpu]->trace_idx > tr->entries) entries += tr->entries; - } else + else entries += tr->data[cpu]->trace_idx; } } @@ -682,7 +733,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) " (M:%s VP:%d, KP:%d, SP:%d HP:%d", data->saved_latency, entries, - (entries + underruns), + total, tr->cpu, #if defined(CONFIG_PREEMPT_NONE) "server", @@ -882,8 +933,7 @@ static int trace_empty(struct trace_iterator *iter) data = iter->tr->data[cpu]; if (data->trace && - (data->trace_idx || - atomic_read(&data->underrun))) + data->trace_idx) return 0; } return 1; @@ -1464,42 +1514,109 @@ static struct tracer no_tracer __read_mostly = .name = "none", }; -static inline notrace int page_order(const unsigned long size) +static int trace_alloc_page(void) { - const unsigned long nr_pages = DIV_ROUND_UP(size, PAGE_SIZE); - return ilog2(roundup_pow_of_two(nr_pages)); + struct trace_array_cpu *data; + void *array; + struct page *page, *tmp; + LIST_HEAD(pages); + int i; + + /* first allocate a page for each CPU */ + for_each_possible_cpu(i) { + array = (void *)__get_free_page(GFP_KERNEL); + if 
(array == NULL) { + printk(KERN_ERR "tracer: failed to allocate page" + "for trace buffer!\n"); + goto free_pages; + } + + page = virt_to_page(array); + list_add(&page->lru, &pages); + +/* Only allocate if we are actually using the max trace */ +#ifdef CONFIG_TRACER_MAX_TRACE + array = (void *)__get_free_page(GFP_KERNEL); + if (array == NULL) { + printk(KERN_ERR "tracer: failed to allocate page" + "for trace buffer!\n"); + goto free_pages; + } + page = virt_to_page(array); + list_add(&page->lru, &pages); +#endif + } + + /* Now that we successfully allocate a page per CPU, add them */ + for_each_possible_cpu(i) { + data = global_trace.data[i]; + page = list_entry(pages.next, struct page, lru); + list_del(&page->lru); + list_add_tail(&page->lru, &data->trace_pages); + ClearPageLRU(page); + +#ifdef CONFIG_TRACER_MAX_TRACE + data = max_tr.data[i]; + page = list_entry(pages.next, struct page, lru); + list_del(&page->lru); + list_add_tail(&page->lru, &data->trace_pages); + SetPageLRU(page); +#endif + } + global_trace.entries += ENTRIES_PER_PAGE; + + return 0; + + free_pages: + list_for_each_entry_safe(page, tmp, &pages, lru) { + list_del(&page->lru); + __free_page(page); + } + return -ENOMEM; } __init static int tracer_alloc_buffers(void) { - const int order = page_order(trace_nr_entries * TRACE_ENTRY_SIZE); - const unsigned long size = (1UL << order) << PAGE_SHIFT; - struct trace_entry *array; + struct trace_array_cpu *data; + void *array; + struct page *page; + int pages = 0; int i; + /* Allocate the first page for all buffers */ for_each_possible_cpu(i) { - global_trace.data[i] = &per_cpu(global_trace_cpu, i); + data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); max_tr.data[i] = &per_cpu(max_data, i); - array = (struct trace_entry *) - __get_free_pages(GFP_KERNEL, order); + array = (void *)__get_free_page(GFP_KERNEL); if (array == NULL) { - printk(KERN_ERR "tracer: failed to allocate" - " %ld bytes for trace buffer!\n", size); + printk(KERN_ERR "tracer: failed to allocate page" + "for trace buffer!\n"); goto free_buffers; } - global_trace.data[i]->trace = array; + data->trace = array; + + /* set the array to the list */ + INIT_LIST_HEAD(&data->trace_pages); + page = virt_to_page(array); + list_add(&page->lru, &data->trace_pages); + /* use the LRU flag to differentiate the two buffers */ + ClearPageLRU(page); /* Only allocate if we are actually using the max trace */ #ifdef CONFIG_TRACER_MAX_TRACE - array = (struct trace_entry *) - __get_free_pages(GFP_KERNEL, order); + array = (void *)__get_free_page(GFP_KERNEL); if (array == NULL) { - printk(KERN_ERR "wakeup tracer: failed to allocate" - " %ld bytes for trace buffer!\n", size); + printk(KERN_ERR "tracer: failed to allocate page" + "for trace buffer!\n"); goto free_buffers; } max_tr.data[i]->trace = array; + + INIT_LIST_HEAD(&max_tr.data[i]->trace_pages); + page = virt_to_page(array); + list_add(&page->lru, &max_tr.data[i]->trace_pages); + SetPageLRU(page); #endif } @@ -1507,11 +1624,18 @@ __init static int tracer_alloc_buffers(void) * Since we allocate by orders of pages, we may be able to * round up a bit. 
*/ - global_trace.entries = size / TRACE_ENTRY_SIZE; + global_trace.entries = ENTRIES_PER_PAGE; max_tr.entries = global_trace.entries; + pages++; + + while (global_trace.entries < trace_nr_entries) { + if (trace_alloc_page()) + break; + pages++; + } - pr_info("tracer: %ld bytes allocated for %ld", - size, trace_nr_entries); + pr_info("tracer: %d pages allocated for %ld", + pages, trace_nr_entries); pr_info(" entries of %ld bytes\n", (long)TRACE_ENTRY_SIZE); pr_info(" actual entries %ld\n", global_trace.entries); @@ -1526,17 +1650,26 @@ __init static int tracer_alloc_buffers(void) free_buffers: for (i-- ; i >= 0; i--) { + struct page *page, *tmp; struct trace_array_cpu *data = global_trace.data[i]; if (data && data->trace) { - free_pages((unsigned long)data->trace, order); + list_for_each_entry_safe(page, tmp, + &data->trace_pages, lru) { + list_del(&page->lru); + __free_page(page); + } data->trace = NULL; } #ifdef CONFIG_TRACER_MAX_TRACE data = max_tr.data[i]; if (data && data->trace) { - free_pages((unsigned long)data->trace, order); + list_for_each_entry_safe(page, tmp, + &data->trace_pages, lru) { + list_del(&page->lru); + __free_page(page); + } data->trace = NULL; } #endif diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3173a93561d..83e257e3808 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -54,9 +54,11 @@ struct trace_entry { */ struct trace_array_cpu { void *trace; + void *trace_current; + unsigned trace_current_idx; + struct list_head trace_pages; unsigned long trace_idx; atomic_t disabled; - atomic_t underrun; unsigned long saved_latency; unsigned long critical_start; unsigned long critical_end; @@ -112,8 +114,10 @@ struct trace_iterator { unsigned long iter_flags; loff_t pos; unsigned long next_idx[NR_CPUS]; + struct list_head *next_page[NR_CPUS]; + unsigned next_page_idx[NR_CPUS]; + long idx; int cpu; - int idx; }; void notrace tracing_reset(struct trace_array_cpu *data); -- cgit v1.2.3 From 361943ad0ba3f16e66859e30a408915e008ba91e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:44 +0200 Subject: ftrace: irqs off smp_processor_id() fix The irqsoff function tracer did a __get_cpu_var to determine if it should trace the function or not. The problem is that __get_cpu_var can preempt between getting the CPU and reading the cpu variable. This means that the cpu variable that is being read is not from the cpu being run on. At worst, this can give a false positive, where we trace the function when we should not. It will never give a false negative since we only want to trace when interrupts are disabled and we never preempt when they are. This fix adds a check after reading the irq flags to only trace if the interrupts are actually disabled. It also changes the reading of the cpu variable to use a raw_smp_processor_id since we now don't care if we preempt. We still catch that fact. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace_irqsoff.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 8b1231633dc..bd3f8819830 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -74,12 +74,21 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) long disabled; int cpu; - if (likely(!__get_cpu_var(tracing_cpu))) + /* + * Does not matter if we preempt. We test the flags + * afterward, to see if irqs are disabled or not. 
+ * If we preempt and get a false positive, the flags + * test will fail. + */ + cpu = raw_smp_processor_id(); + if (likely(!per_cpu(tracing_cpu, cpu))) return; local_save_flags(flags); + /* slight chance to get a false positive on tracing_cpu */ + if (!irqs_disabled_flags(flags)) + return; - cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); -- cgit v1.2.3 From 0764d23cf066c52de42b653144605b481d3fbdbc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:44 +0200 Subject: ftrace: lockdep notrace annotations Add notrace annotations to lockdep to keep ftrace from causing recursive problems with lock tracing and debugging. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/lockdep.c | 23 ++++++++++++----------- kernel/spinlock.c | 2 +- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index e21924365ea..ac46847ba0c 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -271,14 +271,14 @@ static struct list_head chainhash_table[CHAINHASH_SIZE]; ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ (key2)) -void lockdep_off(void) +notrace void lockdep_off(void) { current->lockdep_recursion++; } EXPORT_SYMBOL(lockdep_off); -void lockdep_on(void) +notrace void lockdep_on(void) { current->lockdep_recursion--; } @@ -1041,7 +1041,7 @@ find_usage_forwards(struct lock_class *source, unsigned int depth) * Return 1 otherwise and keep unchanged. * Return 0 on error. */ -static noinline int +static noinline notrace int find_usage_backwards(struct lock_class *source, unsigned int depth) { struct lock_list *entry; @@ -1591,7 +1591,7 @@ static inline int validate_chain(struct task_struct *curr, * We are building curr_chain_key incrementally, so double-check * it from scratch, to make sure that it's done correctly: */ -static void check_chain_key(struct task_struct *curr) +static notrace void check_chain_key(struct task_struct *curr) { #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; @@ -1967,7 +1967,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, /* * Mark all held locks with a usage bit: */ -static int +static notrace int mark_held_locks(struct task_struct *curr, int hardirq) { enum lock_usage_bit usage_bit; @@ -2260,8 +2260,8 @@ static inline int separate_irq_context(struct task_struct *curr, /* * Mark a lock with a usage bit, and validate the state transition: */ -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) +static notrace int mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) { unsigned int new_mask = 1 << new_bit, ret = 1; @@ -2663,7 +2663,7 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) /* * Check whether we follow the irq-flags state precisely: */ -static void check_flags(unsigned long flags) +static notrace void check_flags(unsigned long flags) { #if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS) if (!debug_locks) @@ -2700,8 +2700,8 @@ static void check_flags(unsigned long flags) * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: */ -void lock_acquire(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, unsigned long ip) +notrace void lock_acquire(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, unsigned long ip) { 
unsigned long flags; @@ -2723,7 +2723,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, EXPORT_SYMBOL_GPL(lock_acquire); -void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) +notrace void lock_release(struct lockdep_map *lock, int nested, + unsigned long ip) { unsigned long flags; diff --git a/kernel/spinlock.c b/kernel/spinlock.c index ae28c824512..a1fb54c93cd 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -436,7 +436,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock) } EXPORT_SYMBOL(_spin_trylock_bh); -int in_lock_functions(unsigned long addr) +notrace int in_lock_functions(unsigned long addr) { /* Linker adds these: start and end of __lockfunc functions */ extern char __lock_text_start[], __lock_text_end[]; -- cgit v1.2.3 From 18cef379d30f5ded20cc31d7f2d342639d39919d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:44 +0200 Subject: ftrace: don't use raw_local_irq_save/restore Using raw_local_irq_save/restore confuses lockdep. It's fine to use the normal ones. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 4 ++-- kernel/trace/trace_sched_switch.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d7ad030a4c4..9175ce91b8f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -269,7 +269,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) if (unlikely(!tracer_enabled)) return; - raw_local_irq_save(flags); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); @@ -278,7 +278,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) ftrace(tr, data, ip, parent_ip, flags); atomic_dec(&data->disabled); - raw_local_irq_restore(flags); + local_irq_restore(flags); } static struct ftrace_ops trace_ops __read_mostly = diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 3e4771d3b89..2715267be46 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -29,7 +29,7 @@ ctx_switch_func(struct task_struct *prev, struct task_struct *next) if (!tracer_enabled) return; - raw_local_irq_save(flags); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); @@ -38,7 +38,7 @@ ctx_switch_func(struct task_struct *prev, struct task_struct *next) tracing_sched_switch_trace(tr, data, prev, next, flags); atomic_dec(&data->disabled); - raw_local_irq_restore(flags); + local_irq_restore(flags); } void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) -- cgit v1.2.3 From 89b2f97819dd074297bbe3e19eaa4afcc98845ad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:44 +0200 Subject: ftrace: fix updates to max trace This patch fixes some bugs to the updating of the max trace that was caused by implementing the new buffering. 
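One of those races shows up when two CPUs both decide they have hit a new maximum latency and then race to swap their per-CPU buffers into max_tr. The sketch below is only an outline of the pattern the fix applies in check_critical_timing(), reusing the names max_trace_lock, report_latency() and update_max_tr_single() from the diffs that follow; the real code does more bookkeeping around the swap. The idea is to re-test the latency after taking the lock, so only the true maximum gets recorded:

static DEFINE_SPINLOCK(max_trace_lock);

static void sketch_record_new_max(struct trace_array *tr, cycle_t delta, int cpu)
{
	/* cheap test without the lock first */
	if (!report_latency(delta))
		return;

	spin_lock(&max_trace_lock);

	/* another CPU may have recorded a larger latency in the meantime */
	if (!report_latency(delta))
		goto out_unlock;

	/* still the max: swap this CPU's buffer into max_tr */
	update_max_tr_single(tr, current, cpu);

out_unlock:
	spin_unlock(&max_trace_lock);
}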
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 6 +++++- kernel/trace/trace_irqsoff.c | 27 +++++++++++++++------------ 2 files changed, 20 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9175ce91b8f..95966561ba3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -153,6 +153,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) memcpy(max_tr.data[i], data, sizeof(*data)); data->trace = save_trace; data->trace_pages = save_pages; + tracing_reset(data); } __update_max_tr(tr, tsk, cpu); @@ -183,6 +184,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) memcpy(max_tr.data[cpu], data, sizeof(*data)); data->trace = save_trace; data->trace_pages = save_pages; + tracing_reset(data); __update_max_tr(tr, tsk, cpu); spin_unlock(&ftrace_max_lock); @@ -877,6 +879,8 @@ print_lat_fmt(struct seq_file *m, struct trace_iterator *iter, entry->ctx.next_prio, comm); break; + default: + seq_printf(m, "Unknown type %d\n", entry->type); } } @@ -1625,7 +1629,6 @@ __init static int tracer_alloc_buffers(void) * round up a bit. */ global_trace.entries = ENTRIES_PER_PAGE; - max_tr.entries = global_trace.entries; pages++; while (global_trace.entries < trace_nr_entries) { @@ -1633,6 +1636,7 @@ __init static int tracer_alloc_buffers(void) break; pages++; } + max_tr.entries = global_trace.entries; pr_info("tracer: %d pages allocated for %ld", pages, trace_nr_entries); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index bd3f8819830..74165f611f3 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -23,6 +23,8 @@ static int tracer_enabled __read_mostly; static DEFINE_PER_CPU(int, tracing_cpu); +static DEFINE_SPINLOCK(max_trace_lock); + enum { TRACER_IRQS_OFF = (1 << 1), TRACER_PREEMPT_OFF = (1 << 2), @@ -126,7 +128,7 @@ check_critical_timing(struct trace_array *tr, int cpu) { unsigned long latency, t0, t1; - cycle_t T0, T1, T2, delta; + cycle_t T0, T1, delta; unsigned long flags; /* @@ -142,20 +144,18 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out; - ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); - /* - * Update the timestamp, because the trace entry above - * might change it (it can only get larger so the latency - * is fair to be reported): - */ - T2 = now(cpu); + spin_lock(&max_trace_lock); - delta = T2-T0; + /* check if we are still the max latency */ + if (!report_latency(delta)) + goto out_unlock; + + ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); latency = nsecs_to_usecs(delta); if (data->critical_sequence != max_sequence) - goto out; + goto out_unlock; tracing_max_latency = delta; t0 = nsecs_to_usecs(T0); @@ -189,6 +189,9 @@ check_critical_timing(struct trace_array *tr, max_sequence++; +out_unlock: + spin_unlock(&max_trace_lock); + out: data->critical_sequence = max_sequence; data->preempt_timestamp = now(cpu); @@ -366,14 +369,14 @@ void notrace trace_preempt_off(unsigned long a0, unsigned long a1) static void start_irqsoff_tracer(struct trace_array *tr) { - tracer_enabled = 1; register_ftrace_function(&trace_ops); + tracer_enabled = 1; } static void stop_irqsoff_tracer(struct trace_array *tr) { - unregister_ftrace_function(&trace_ops); tracer_enabled = 0; + unregister_ftrace_function(&trace_ops); } static void __irqsoff_tracer_init(struct trace_array *tr) -- cgit v1.2.3 From 57f50be14d57b0dbf88dd019e7bb0ff3a3dc7b81 Mon 
Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:44 +0200 Subject: ftrace: fix max latency Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 95966561ba3..9bad2379115 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -69,6 +69,11 @@ static int __init set_nr_entries(char *str) } __setup("trace_entries=", set_nr_entries); +unsigned long nsecs_to_usecs(unsigned long nsecs) +{ + return nsecs / 1000; +} + enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -733,7 +738,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) "---------------------------------\n"); seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |" " (M:%s VP:%d, KP:%d, SP:%d HP:%d", - data->saved_latency, + nsecs_to_usecs(data->saved_latency), entries, total, tr->cpu, @@ -771,11 +776,6 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) seq_puts(m, "\n"); } -unsigned long nsecs_to_usecs(unsigned long nsecs) -{ - return nsecs / 1000; -} - static void notrace lat_print_generic(struct seq_file *m, struct trace_entry *entry, int cpu) { -- cgit v1.2.3 From e1c08bdd9fa73e44096e5a82c0d5928b04ab02c8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:44 +0200 Subject: ftrace: force recording Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 97d5cb7b7e7..4facf5ceeb8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -146,6 +146,10 @@ static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE +static struct task_struct *ftraced_task; +static DECLARE_WAIT_QUEUE_HEAD(ftraced_waiters); +static unsigned long ftraced_iteration_counter; + enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -590,9 +594,12 @@ static int notrace ftraced(void *ignore) ftraced_trigger = 0; ftrace_record_suspend--; } + ftraced_iteration_counter++; mutex_unlock(&ftraced_lock); mutex_unlock(&ftrace_sysctl_lock); + wake_up_interruptible(&ftraced_waiters); + ftrace_shutdown_replenish(); set_current_state(TASK_INTERRUPTIBLE); @@ -1050,6 +1057,49 @@ static struct file_operations ftrace_filter_fops = { .release = ftrace_filter_release, }; +/** + * ftrace_force_update - force an update to all recording ftrace functions + * + * The ftrace dynamic update daemon only wakes up once a second. + * There may be cases where an update needs to be done immediately + * for tests or internal kernel tracing to begin. This function + * wakes the daemon to do an update and will not return until the + * update is complete. 
+ */ +int ftrace_force_update(void) +{ + unsigned long last_counter; + DECLARE_WAITQUEUE(wait, current); + int ret = 0; + + if (!ftraced_task) + return -ENODEV; + + mutex_lock(&ftraced_lock); + last_counter = ftraced_iteration_counter; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&ftraced_waiters, &wait); + + do { + mutex_unlock(&ftraced_lock); + wake_up_process(ftraced_task); + schedule(); + mutex_lock(&ftraced_lock); + if (signal_pending(current)) { + ret = -EINTR; + break; + } + set_current_state(TASK_INTERRUPTIBLE); + } while (last_counter == ftraced_iteration_counter); + + mutex_unlock(&ftraced_lock); + remove_wait_queue(&ftraced_waiters, &wait); + set_current_state(TASK_RUNNING); + + return ret; +} + static __init int ftrace_init_debugfs(void) { struct dentry *d_tracer; @@ -1095,6 +1145,7 @@ static int __init notrace ftrace_dynamic_init(void) return -1; last_ftrace_enabled = ftrace_enabled = 1; + ftraced_task = p; return 0; } -- cgit v1.2.3 From 60a11774b38fef1ab90b18c5353bd1c7c4d311c8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:44 +0200 Subject: ftrace: add self-tests Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/Kconfig | 13 ++ kernel/trace/trace.c | 63 +++++- kernel/trace/trace.h | 31 +++ kernel/trace/trace_functions.c | 3 + kernel/trace/trace_irqsoff.c | 9 + kernel/trace/trace_sched_switch.c | 3 + kernel/trace/trace_sched_wakeup.c | 3 + kernel/trace/trace_selftest.c | 415 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 538 insertions(+), 2 deletions(-) create mode 100644 kernel/trace/trace_selftest.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index cad9db1dee0..3f73a171024 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -105,3 +105,16 @@ config DYNAMIC_FTRACE wakes up once a second and checks to see if any ftrace calls were made. If so, it runs stop_machine (stops all CPUS) and modifies the code to jump over the call to ftrace. + +config FTRACE_SELFTEST + bool + +config FTRACE_STARTUP_TEST + bool "Perform a startup test on ftrace" + depends on TRACING + select FTRACE_SELFTEST + help + This option performs a series of startup tests on ftrace. On bootup + a series of tests are made to verify that the tracer is + functioning properly. It will do tests on all the configured + tracers of ftrace. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9bad2379115..f6d026f17db 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -32,6 +32,8 @@ unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; +static int tracing_disabled = 1; + static long notrace ns2usecs(cycle_t nsec) { @@ -217,11 +219,48 @@ int register_tracer(struct tracer *type) } } +#ifdef CONFIG_FTRACE_STARTUP_TEST + if (type->selftest) { + struct tracer *saved_tracer = current_trace; + struct trace_array_cpu *data; + struct trace_array *tr = &global_trace; + int saved_ctrl = tr->ctrl; + int i; + /* + * Run a selftest on this tracer. + * Here we reset the trace buffer, and set the current + * tracer to be this tracer. The tracer can then run some + * internal tracing to verify that everything is in order. + * If we fail, we do not register this tracer. 
+ */ + for_each_possible_cpu(i) { + if (!data->trace) + continue; + data = tr->data[i]; + tracing_reset(data); + } + current_trace = type; + tr->ctrl = 0; + /* the test is responsible for initializing and enabling */ + pr_info("Testing tracer %s: ", type->name); + ret = type->selftest(type, tr); + /* the test is responsible for resetting too */ + current_trace = saved_tracer; + tr->ctrl = saved_ctrl; + if (ret) { + printk(KERN_CONT "FAILED!\n"); + goto out; + } + printk(KERN_CONT "PASSED\n"); + } +#endif + type->next = trace_types; trace_types = type; len = strlen(type->name); if (len > max_tracer_type_len) max_tracer_type_len = len; + out: mutex_unlock(&trace_types_lock); @@ -985,6 +1024,11 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) { struct trace_iterator *iter; + if (tracing_disabled) { + *ret = -ENODEV; + return NULL; + } + iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { *ret = -ENOMEM; @@ -1023,6 +1067,9 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) int tracing_open_generic(struct inode *inode, struct file *filp) { + if (tracing_disabled) + return -ENODEV; + filp->private_data = inode->i_private; return 0; } @@ -1128,6 +1175,9 @@ static int show_traces_open(struct inode *inode, struct file *file) { int ret; + if (tracing_disabled) + return -ENODEV; + ret = seq_open(file, &show_traces_seq_ops); if (!ret) { struct seq_file *m = file->private_data; @@ -1452,6 +1502,11 @@ struct dentry *tracing_init_dentry(void) return d_tracer; } +#ifdef CONFIG_FTRACE_SELFTEST +/* Let selftest have access to static functions in this file */ +#include "trace_selftest.c" +#endif + static __init void tracer_init_debugfs(void) { struct dentry *d_tracer; @@ -1585,6 +1640,7 @@ __init static int tracer_alloc_buffers(void) void *array; struct page *page; int pages = 0; + int ret = -ENOMEM; int i; /* Allocate the first page for all buffers */ @@ -1650,6 +1706,9 @@ __init static int tracer_alloc_buffers(void) register_tracer(&no_tracer); current_trace = &no_tracer; + /* All seems OK, enable tracing */ + tracing_disabled = 0; + return 0; free_buffers: @@ -1678,7 +1737,7 @@ __init static int tracer_alloc_buffers(void) } #endif } - return -ENOMEM; + return ret; } -device_initcall(tracer_alloc_buffers); +fs_initcall(tracer_alloc_buffers); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 83e257e3808..88edbf1f678 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -99,6 +99,10 @@ struct tracer { void (*start)(struct trace_iterator *iter); void (*stop)(struct trace_iterator *iter); void (*ctrl_update)(struct trace_array *tr); +#ifdef CONFIG_FTRACE_STARTUP_TEST + int (*selftest)(struct tracer *trace, + struct trace_array *tr); +#endif struct tracer *next; int print_max; }; @@ -185,4 +189,31 @@ extern int unregister_tracer_switch(struct tracer_switch_ops *ops); extern unsigned long ftrace_update_tot_cnt; #endif +#ifdef CONFIG_FTRACE_STARTUP_TEST +#ifdef CONFIG_FTRACE +extern int trace_selftest_startup_function(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_IRQSOFF_TRACER +extern int trace_selftest_startup_irqsoff(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_PREEMPT_TRACER +extern int trace_selftest_startup_preemptoff(struct tracer *trace, + struct trace_array *tr); +#endif +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) +extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_SCHED_TRACER +extern int 
trace_selftest_startup_wakeup(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +extern int trace_selftest_startup_sched_switch(struct tracer *trace, + struct trace_array *tr); +#endif +#endif /* CONFIG_FTRACE_STARTUP_TEST */ + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 82988c5336e..5d8ad7a0960 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -63,6 +63,9 @@ static struct tracer function_trace __read_mostly = .init = function_trace_init, .reset = function_trace_reset, .ctrl_update = function_trace_ctrl_update, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_function, +#endif }; static __init int init_function_trace(void) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 74165f611f3..14183b8f79c 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -432,6 +432,9 @@ static struct tracer irqsoff_tracer __read_mostly = .close = irqsoff_tracer_close, .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_irqsoff, +#endif }; # define register_irqsoff(trace) register_tracer(&trace) #else @@ -455,6 +458,9 @@ static struct tracer preemptoff_tracer __read_mostly = .close = irqsoff_tracer_close, .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_preemptoff, +#endif }; # define register_preemptoff(trace) register_tracer(&trace) #else @@ -480,6 +486,9 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .close = irqsoff_tracer_close, .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_preemptirqsoff, +#endif }; # define register_preemptirqsoff(trace) register_tracer(&trace) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 2715267be46..6c9284103a6 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -107,6 +107,9 @@ static struct tracer sched_switch_trace __read_mostly = .init = sched_switch_trace_init, .reset = sched_switch_trace_reset, .ctrl_update = sched_switch_trace_ctrl_update, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_sched_switch, +#endif }; __init static int init_sched_switch_trace(void) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 7c3ccefcf4c..3d10ff01f80 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -295,6 +295,9 @@ static struct tracer wakeup_tracer __read_mostly = .close = wakeup_tracer_close, .ctrl_update = wakeup_tracer_ctrl_update, .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif }; __init static int init_wakeup_tracer(void) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c new file mode 100644 index 00000000000..ef4d3cc009f --- /dev/null +++ b/kernel/trace/trace_selftest.c @@ -0,0 +1,415 @@ +/* Include in trace.c */ + +#include + +static inline int trace_valid_entry(struct trace_entry *entry) +{ + switch (entry->type) { + case TRACE_FN: + case TRACE_CTX: + return 1; + } + return 0; +} + +static int +trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) +{ + struct page *page; + struct trace_entry *entries; + int idx = 0; + int i; + + page = 
list_entry(data->trace_pages.next, struct page, lru); + entries = page_address(page); + + if (data->trace != entries) + goto failed; + + /* + * The starting trace buffer always has valid elements, + * if any element exits. + */ + entries = data->trace; + + for (i = 0; i < tr->entries; i++) { + + if (i < data->trace_idx && + !trace_valid_entry(&entries[idx])) { + printk(KERN_CONT ".. invalid entry %d ", entries[idx].type); + goto failed; + } + + idx++; + if (idx >= ENTRIES_PER_PAGE) { + page = virt_to_page(entries); + if (page->lru.next == &data->trace_pages) { + if (i != tr->entries - 1) { + printk(KERN_CONT ".. entries buffer mismatch"); + goto failed; + } + } else { + page = list_entry(page->lru.next, struct page, lru); + entries = page_address(page); + } + idx = 0; + } + } + + page = virt_to_page(entries); + if (page->lru.next != &data->trace_pages) { + printk(KERN_CONT ".. too many entries"); + goto failed; + } + + return 0; + + failed: + printk(KERN_CONT ".. corrupted trace buffer .. "); + return -1; +} + +/* + * Test the trace buffer to see if all the elements + * are still sane. + */ +static int trace_test_buffer(struct trace_array *tr, unsigned long *count) +{ + unsigned long cnt = 0; + int cpu; + int ret = 0; + + for_each_possible_cpu(cpu) { + if (!tr->data[cpu]->trace) + continue; + + cnt += tr->data[cpu]->trace_idx; + printk("%d: count = %ld\n", cpu, cnt); + + ret = trace_test_buffer_cpu(tr, tr->data[cpu]); + if (ret) + break; + } + + if (count) + *count = cnt; + + return ret; +} + +#ifdef CONFIG_FTRACE +/* + * Simple verification test of ftrace function tracer. + * Enable ftrace, sleep 1/10 second, and then read the trace + * buffer to see if all is in order. + */ +int +trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* make sure functions have been recorded */ + ret = ftrace_force_update(); + if (ret) { + printk(KERN_CONT ".. ftraced failed .. "); + return ret; + } + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_FTRACE */ + +#ifdef CONFIG_IRQSOFF_TRACER +int +trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* reset the max latency */ + tracing_max_latency = 0; + /* disable interrupts for a bit */ + local_irq_disable(); + udelay(100); + local_irq_enable(); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (!ret) + ret = trace_test_buffer(&max_tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. 
no entries found .."); + ret = -1; + } + + tracing_max_latency = save_max; + + return ret; +} +#endif /* CONFIG_IRQSOFF_TRACER */ + +#ifdef CONFIG_PREEMPT_TRACER +int +trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* reset the max latency */ + tracing_max_latency = 0; + /* disable preemption for a bit */ + preempt_disable(); + udelay(100); + preempt_enable(); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (!ret) + ret = trace_test_buffer(&max_tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + tracing_max_latency = save_max; + + return ret; +} +#endif /* CONFIG_PREEMPT_TRACER */ + +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) +int +trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + + /* reset the max latency */ + tracing_max_latency = 0; + + /* disable preemption and interrupts for a bit */ + preempt_disable(); + local_irq_disable(); + udelay(100); + preempt_enable(); + /* reverse the order of preempt vs irqs */ + local_irq_enable(); + + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (ret) + goto out; + + ret = trace_test_buffer(&max_tr, &count); + if (ret) + goto out; + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + /* do the test by disabling interrupts first this time */ + tracing_max_latency = 0; + tr->ctrl = 1; + trace->ctrl_update(tr); + preempt_disable(); + local_irq_disable(); + udelay(100); + preempt_enable(); + /* reverse the order of preempt vs irqs */ + local_irq_enable(); + + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (ret) + goto out; + + ret = trace_test_buffer(&max_tr, &count); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + out: + trace->reset(tr); + tracing_max_latency = save_max; + + return ret; +} +#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */ + +#ifdef CONFIG_SCHED_TRACER +static int trace_wakeup_test_thread(void *data) +{ + struct completion *x = data; + + /* Make this a RT thread, doesn't need to be too high */ + + rt_mutex_setprio(current, MAX_RT_PRIO - 5); + + /* Make it know we have a new prio */ + complete(x); + + /* now go to sleep and let the test wake us up */ + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + + /* we are awake, now wait to disappear */ + while (!kthread_should_stop()) { + /* + * This is an RT task, do short sleeps to let + * others run. 
+ */ + msleep(100); + } + + return 0; +} + +int +trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + struct task_struct *p; + struct completion isrt; + unsigned long count; + int ret; + + init_completion(&isrt); + + /* create a high prio thread */ + p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); + if (!IS_ERR(p)) { + printk(KERN_CONT "Failed to create ftrace wakeup test thread "); + return -1; + } + + /* make sure the thread is running at an RT prio */ + wait_for_completion(&isrt); + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* reset the max latency */ + tracing_max_latency = 0; + + /* sleep to let the RT thread sleep too */ + msleep(100); + + /* + * Yes this is slightly racy. It is possible that for some + * strange reason that the RT thread we created, did not + * call schedule for 100ms after doing the completion, + * and we do a wakeup on a task that already is awake. + * But that is extremely unlikely, and the worst thing that + * happens in such a case, is that we disable tracing. + * Honestly, if this race does happen something is horrible + * wrong with the system. + */ + + wake_up_process(p); + + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (!ret) + ret = trace_test_buffer(&max_tr, &count); + + + trace->reset(tr); + + tracing_max_latency = save_max; + + /* kill the thread */ + kthread_stop(p); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_SCHED_TRACER */ + +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +int +trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ + +#ifdef CONFIG_DYNAMIC_FTRACE +#endif /* CONFIG_DYNAMIC_FTRACE */ -- cgit v1.2.3 From c7aafc549766b87819285d3480648fc652a47bc4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:45 +0200 Subject: ftrace: cleanups factor out code and clean it up. 
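Among the cleanups below, flip_trace() swaps the page lists of the live per-CPU buffer and the max-latency snapshot by splicing them through a temporary list head instead of copying entries. A minimal userspace sketch of that splice swap, assuming simplified hand-rolled node types and helper names in place of the kernel's struct list_head, list_splice_init() and list_add_tail():

#include <stdio.h>

struct node {
	struct node *next, *prev;
	int val;
};

static void list_init(struct node *h)
{
	h->next = h->prev = h;
}

static void list_add_tail_node(struct node *n, struct node *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

/* move every node of src to the front of dst, leaving src empty */
static void list_splice_init_node(struct node *src, struct node *dst)
{
	if (src->next == src)
		return;
	src->next->prev = dst;
	src->prev->next = dst->next;
	dst->next->prev = src->prev;
	dst->next = src->next;
	list_init(src);
}

/* the flip_trace() idea: exchange two lists via a temporary head */
static void flip(struct node *a, struct node *b)
{
	struct node tmp;

	list_init(&tmp);
	list_splice_init_node(a, &tmp);
	list_splice_init_node(b, a);
	list_splice_init_node(&tmp, b);
}

int main(void)
{
	struct node a, b, n1 = { .val = 1 }, n2 = { .val = 2 }, n3 = { .val = 3 };
	struct node *p;

	list_init(&a);
	list_init(&b);
	list_add_tail_node(&n1, &a);
	list_add_tail_node(&n2, &a);
	list_add_tail_node(&n3, &b);

	flip(&a, &b);		/* a now holds {3}, b holds {1, 2} */

	for (p = a.next; p != &a; p = p->next)
		printf("a: %d\n", p->val);
	for (p = b.next; p != &b; p = p->next)
		printf("b: %d\n", p->val);
	return 0;
}

Splicing only relinks the list heads, so the cost of taking a max-latency snapshot stays constant no matter how many trace pages each CPU buffer holds; the entries themselves are never copied.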
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 8 +-- kernel/trace/trace.c | 144 ++++++++++++++++++++++++-------------- kernel/trace/trace.h | 8 ++- kernel/trace/trace_irqsoff.c | 32 ++++----- kernel/trace/trace_sched_wakeup.c | 18 ++--- kernel/trace/trace_selftest.c | 25 ++++--- 6 files changed, 133 insertions(+), 102 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4facf5ceeb8..6d4d2e86deb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1152,10 +1152,10 @@ static int __init notrace ftrace_dynamic_init(void) core_initcall(ftrace_dynamic_init); #else -# define ftrace_startup() do { } while (0) -# define ftrace_shutdown() do { } while (0) -# define ftrace_startup_sysctl() do { } while (0) -# define ftrace_shutdown_sysctl() do { } while (0) +# define ftrace_startup() do { } while (0) +# define ftrace_shutdown() do { } while (0) +# define ftrace_startup_sysctl() do { } while (0) +# define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ /** diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f6d026f17db..61d2f022886 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -142,12 +142,59 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_record_cmdline(current); } +void check_pages(struct trace_array_cpu *data) +{ + struct page *page, *tmp; + + BUG_ON(data->trace_pages.next->prev != &data->trace_pages); + BUG_ON(data->trace_pages.prev->next != &data->trace_pages); + + list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { + BUG_ON(page->lru.next->prev != &page->lru); + BUG_ON(page->lru.prev->next != &page->lru); + } +} + +void *head_page(struct trace_array_cpu *data) +{ + struct page *page; + + check_pages(data); + if (list_empty(&data->trace_pages)) + return NULL; + + page = list_entry(data->trace_pages.next, struct page, lru); + BUG_ON(&page->lru == &data->trace_pages); + + return page_address(page); +} + +notrace static void +flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) +{ + struct list_head flip_pages; + + INIT_LIST_HEAD(&flip_pages); + + tr1->trace_current = NULL; + memcpy(&tr1->trace_current_idx, &tr2->trace_current_idx, + sizeof(struct trace_array_cpu) - + offsetof(struct trace_array_cpu, trace_current_idx)); + + check_pages(tr1); + check_pages(tr2); + list_splice_init(&tr1->trace_pages, &flip_pages); + list_splice_init(&tr2->trace_pages, &tr1->trace_pages); + list_splice_init(&flip_pages, &tr2->trace_pages); + BUG_ON(!list_empty(&flip_pages)); + check_pages(tr1); + check_pages(tr2); +} + notrace void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data; - void *save_trace; - struct list_head save_pages; int i; WARN_ON_ONCE(!irqs_disabled()); @@ -155,11 +202,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) /* clear out all the previous traces */ for_each_possible_cpu(i) { data = tr->data[i]; - save_trace = max_tr.data[i]->trace; - save_pages = max_tr.data[i]->trace_pages; - memcpy(max_tr.data[i], data, sizeof(*data)); - data->trace = save_trace; - data->trace_pages = save_pages; + flip_trace(max_tr.data[i], data); tracing_reset(data); } @@ -177,8 +220,6 @@ notrace void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data = tr->data[cpu]; - void *save_trace; - struct list_head save_pages; int i; WARN_ON_ONCE(!irqs_disabled()); @@ -186,11 
+227,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) for_each_possible_cpu(i) tracing_reset(max_tr.data[i]); - save_trace = max_tr.data[cpu]->trace; - save_pages = max_tr.data[cpu]->trace_pages; - memcpy(max_tr.data[cpu], data, sizeof(*data)); - data->trace = save_trace; - data->trace_pages = save_pages; + flip_trace(max_tr.data[cpu], data); + tracing_reset(data); __update_max_tr(tr, tsk, cpu); @@ -234,9 +272,9 @@ int register_tracer(struct tracer *type) * If we fail, we do not register this tracer. */ for_each_possible_cpu(i) { - if (!data->trace) - continue; data = tr->data[i]; + if (!head_page(data)) + continue; tracing_reset(data); } current_trace = type; @@ -298,7 +336,7 @@ void unregister_tracer(struct tracer *type) void notrace tracing_reset(struct trace_array_cpu *data) { data->trace_idx = 0; - data->trace_current = data->trace; + data->trace_current = head_page(data); data->trace_current_idx = 0; } @@ -425,26 +463,31 @@ notrace void tracing_record_cmdline(struct task_struct *tsk) } static inline notrace struct trace_entry * -tracing_get_trace_entry(struct trace_array *tr, - struct trace_array_cpu *data) +tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) { unsigned long idx, idx_next; struct trace_entry *entry; - struct page *page; struct list_head *next; + struct page *page; data->trace_idx++; idx = data->trace_current_idx; idx_next = idx + 1; + BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE); + entry = data->trace_current + idx * TRACE_ENTRY_SIZE; if (unlikely(idx_next >= ENTRIES_PER_PAGE)) { page = virt_to_page(data->trace_current); - if (unlikely(&page->lru == data->trace_pages.prev)) - next = data->trace_pages.next; - else - next = page->lru.next; + /* + * Roundrobin - but skip the head (which is not a real page): + */ + next = page->lru.next; + if (unlikely(next == &data->trace_pages)) + next = next->next; + BUG_ON(next == &data->trace_pages); + page = list_entry(next, struct page, lru); data->trace_current = page_address(page); idx_next = 0; @@ -456,18 +499,17 @@ tracing_get_trace_entry(struct trace_array *tr, } static inline notrace void -tracing_generic_entry_update(struct trace_entry *entry, - unsigned long flags) +tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) { struct task_struct *tsk = current; unsigned long pc; pc = preempt_count(); - entry->idx = atomic_inc_return(&tracer_counter); - entry->preempt_count = pc & 0xff; - entry->pid = tsk->pid; - entry->t = now(raw_smp_processor_id()); + entry->idx = atomic_inc_return(&tracer_counter); + entry->preempt_count = pc & 0xff; + entry->pid = tsk->pid; + entry->t = now(raw_smp_processor_id()); entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? 
TRACE_FLAG_SOFTIRQ : 0) | @@ -476,16 +518,15 @@ tracing_generic_entry_update(struct trace_entry *entry, notrace void ftrace(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, - unsigned long flags) + unsigned long ip, unsigned long parent_ip, unsigned long flags) { struct trace_entry *entry; - entry = tracing_get_trace_entry(tr, data); + entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); - entry->type = TRACE_FN; - entry->fn.ip = ip; - entry->fn.parent_ip = parent_ip; + entry->type = TRACE_FN; + entry->fn.ip = ip; + entry->fn.parent_ip = parent_ip; } notrace void @@ -496,7 +537,7 @@ tracing_sched_switch_trace(struct trace_array *tr, { struct trace_entry *entry; - entry = tracing_get_trace_entry(tr, data); + entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); entry->type = TRACE_CTX; entry->ctx.prev_pid = prev->pid; @@ -540,6 +581,8 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, } page = list_entry(iter->next_page[cpu], struct page, lru); + BUG_ON(&data->trace_pages == &page->lru); + array = page_address(page); return &array[iter->next_page_idx[cpu]]; @@ -554,7 +597,7 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu) int cpu; for_each_possible_cpu(cpu) { - if (!tr->data[cpu]->trace) + if (!head_page(tr->data[cpu])) continue; ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); if (ent && @@ -762,7 +805,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) name = type->name; for_each_possible_cpu(cpu) { - if (tr->data[cpu]->trace) { + if (head_page(tr->data[cpu])) { total += tr->data[cpu]->trace_idx; if (tr->data[cpu]->trace_idx > tr->entries) entries += tr->entries; @@ -975,8 +1018,7 @@ static int trace_empty(struct trace_iterator *iter) for_each_possible_cpu(cpu) { data = iter->tr->data[cpu]; - if (data->trace && - data->trace_idx) + if (head_page(data) && data->trace_idx) return 0; } return 1; @@ -1576,9 +1618,9 @@ static struct tracer no_tracer __read_mostly = static int trace_alloc_page(void) { struct trace_array_cpu *data; - void *array; struct page *page, *tmp; LIST_HEAD(pages); + void *array; int i; /* first allocate a page for each CPU */ @@ -1610,14 +1652,14 @@ static int trace_alloc_page(void) for_each_possible_cpu(i) { data = global_trace.data[i]; page = list_entry(pages.next, struct page, lru); - list_del(&page->lru); + list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); ClearPageLRU(page); #ifdef CONFIG_TRACER_MAX_TRACE data = max_tr.data[i]; page = list_entry(pages.next, struct page, lru); - list_del(&page->lru); + list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); SetPageLRU(page); #endif @@ -1628,7 +1670,7 @@ static int trace_alloc_page(void) free_pages: list_for_each_entry_safe(page, tmp, &pages, lru) { - list_del(&page->lru); + list_del_init(&page->lru); __free_page(page); } return -ENOMEM; @@ -1654,7 +1696,6 @@ __init static int tracer_alloc_buffers(void) "for trace buffer!\n"); goto free_buffers; } - data->trace = array; /* set the array to the list */ INIT_LIST_HEAD(&data->trace_pages); @@ -1671,7 +1712,6 @@ __init static int tracer_alloc_buffers(void) "for trace buffer!\n"); goto free_buffers; } - max_tr.data[i]->trace = array; INIT_LIST_HEAD(&max_tr.data[i]->trace_pages); page = virt_to_page(array); @@ -1716,24 +1756,22 @@ __init static int tracer_alloc_buffers(void) struct page *page, *tmp; struct trace_array_cpu *data = global_trace.data[i]; - if 
(data && data->trace) { + if (data) { list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { - list_del(&page->lru); + list_del_init(&page->lru); __free_page(page); } - data->trace = NULL; } #ifdef CONFIG_TRACER_MAX_TRACE data = max_tr.data[i]; - if (data && data->trace) { + if (data) { list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { - list_del(&page->lru); + list_del_init(&page->lru); __free_page(page); } - data->trace = NULL; } #endif } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 88edbf1f678..cc1d34b8b77 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -53,12 +53,12 @@ struct trace_entry { * the trace, etc.) */ struct trace_array_cpu { - void *trace; void *trace_current; - unsigned trace_current_idx; struct list_head trace_pages; - unsigned long trace_idx; atomic_t disabled; + /* these fields get copied into max-trace: */ + unsigned trace_current_idx; + unsigned long trace_idx; unsigned long saved_latency; unsigned long critical_start; unsigned long critical_end; @@ -216,4 +216,6 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace, #endif #endif /* CONFIG_FTRACE_STARTUP_TEST */ +extern void *head_page(struct trace_array_cpu *data); + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 14183b8f79c..2dfebb67fdf 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -144,7 +144,7 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out; - spin_lock(&max_trace_lock); + spin_lock_irqsave(&max_trace_lock, flags); /* check if we are still the max latency */ if (!report_latency(delta)) @@ -165,32 +165,24 @@ check_critical_timing(struct trace_array *tr, update_max_tr_single(tr, current, cpu); - if (tracing_thresh) - printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical section " - "violates %lu us threshold.\n" - " => started at timestamp %lu: ", + if (tracing_thresh) { + printk(KERN_INFO "(%16s-%-5d|#%d):" + " %lu us critical section violates %lu us threshold.\n", current->comm, current->pid, raw_smp_processor_id(), - latency, nsecs_to_usecs(tracing_thresh), t0); - else + latency, nsecs_to_usecs(tracing_thresh)); + } else { printk(KERN_INFO "(%16s-%-5d|#%d):" - " new %lu us maximum-latency " - "critical section.\n => started at timestamp %lu: ", + " new %lu us maximum-latency critical section.\n", current->comm, current->pid, raw_smp_processor_id(), - latency, t0); - - print_symbol(KERN_CONT "<%s>\n", data->critical_start); - printk(KERN_CONT " => ended at timestamp %lu: ", t1); - print_symbol(KERN_CONT "<%s>\n", data->critical_end); - dump_stack(); - t1 = nsecs_to_usecs(now(cpu)); - printk(KERN_CONT " => dump-end timestamp %lu\n\n", t1); + latency); + } max_sequence++; out_unlock: - spin_unlock(&max_trace_lock); + spin_unlock_irqrestore(&max_trace_lock, flags); out: data->critical_sequence = max_sequence; @@ -216,7 +208,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(!data) || unlikely(!data->trace) || + if (unlikely(!data) || unlikely(!head_page(data)) || atomic_read(&data->disabled)) return; @@ -256,7 +248,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(!data) || unlikely(!data->trace) || + if (unlikely(!data) || unlikely(!head_page(data)) || !data->critical_start || atomic_read(&data->disabled)) return; diff --git 
a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 3d10ff01f80..688df965f3f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -107,24 +107,18 @@ wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) update_max_tr(tr, wakeup_task, wakeup_cpu); if (tracing_thresh) { - printk(KERN_INFO "(%16s-%-5d|#%d): %lu us wakeup latency " - "violates %lu us threshold.\n" - " => started at timestamp %lu: ", + printk(KERN_INFO "(%16s-%-5d|#%d):" + " %lu us wakeup latency violates %lu us threshold.\n", wakeup_task->comm, wakeup_task->pid, raw_smp_processor_id(), - latency, nsecs_to_usecs(tracing_thresh), t0); + latency, nsecs_to_usecs(tracing_thresh)); } else { - printk(KERN_INFO "(%16s-%-5d|#%d): new %lu us maximum " - "wakeup latency.\n => started at timestamp %lu: ", + printk(KERN_INFO "(%16s-%-5d|#%d):" + " new %lu us maximum wakeup latency.\n", wakeup_task->comm, wakeup_task->pid, - cpu, latency, t0); + cpu, latency); } - printk(KERN_CONT " ended at timestamp %lu: ", t1); - dump_stack(); - t1 = nsecs_to_usecs(now(cpu)); - printk(KERN_CONT " dump-end timestamp %lu\n\n", t1); - out_unlock: __wakeup_reset(tr); spin_unlock_irqrestore(&wakeup_lock, flags); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index ef4d3cc009f..c01874c3b1f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1,6 +1,7 @@ /* Include in trace.c */ #include +#include static inline int trace_valid_entry(struct trace_entry *entry) { @@ -15,28 +16,29 @@ static inline int trace_valid_entry(struct trace_entry *entry) static int trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) { - struct page *page; struct trace_entry *entries; + struct page *page; int idx = 0; int i; + BUG_ON(list_empty(&data->trace_pages)); page = list_entry(data->trace_pages.next, struct page, lru); entries = page_address(page); - if (data->trace != entries) + if (head_page(data) != entries) goto failed; /* * The starting trace buffer always has valid elements, - * if any element exits. + * if any element exists. */ - entries = data->trace; + entries = head_page(data); for (i = 0; i < tr->entries; i++) { - if (i < data->trace_idx && - !trace_valid_entry(&entries[idx])) { - printk(KERN_CONT ".. invalid entry %d ", entries[idx].type); + if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) { + printk(KERN_CONT ".. invalid entry %d ", + entries[idx].type); goto failed; } @@ -80,11 +82,10 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) int ret = 0; for_each_possible_cpu(cpu) { - if (!tr->data[cpu]->trace) + if (!head_page(tr->data[cpu])) continue; cnt += tr->data[cpu]->trace_idx; - printk("%d: count = %ld\n", cpu, cnt); ret = trace_test_buffer_cpu(tr, tr->data[cpu]); if (ret) @@ -117,6 +118,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) } /* start the tracing */ + ftrace_enabled = 1; + tr->ctrl = 1; trace->init(tr); /* Sleep for a 1/10 of a second */ @@ -124,6 +127,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* stop the tracing. 
*/ tr->ctrl = 0; trace->ctrl_update(tr); + ftrace_enabled = 0; + /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); @@ -328,7 +333,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* create a high prio thread */ p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); - if (!IS_ERR(p)) { + if (IS_ERR(p)) { printk(KERN_CONT "Failed to create ftrace wakeup test thread "); return -1; } -- cgit v1.2.3 From 7bd2f24c2f769e3f8f1d4fc8b9fddf689825f6a7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:45 +0200 Subject: ftrace: add README make it easier for newbies to find their way around. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 61d2f022886..c736dd2e068 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -995,6 +995,7 @@ print_trace_fmt(struct seq_file *m, struct trace_iterator *iter) seq_printf(m, " <-"); seq_print_ip_sym(m, entry->fn.parent_ip, sym_flags); } + seq_printf(m, "\n"); break; case TRACE_CTX: S = entry->ctx.prev_state < sizeof(state_to_char) ? @@ -1007,7 +1008,6 @@ print_trace_fmt(struct seq_file *m, struct trace_iterator *iter) entry->ctx.next_prio); break; } - seq_printf(m, "\n"); } static int trace_empty(struct trace_iterator *iter) @@ -1332,6 +1332,39 @@ static struct file_operations tracing_iter_fops = { .write = tracing_iter_ctrl_write, }; +static const char readme_msg[] = + "tracing mini-HOWTO:\n\n" + "# mkdir /debug\n" + "# mount -t debugfs nodev /debug\n\n" + "# cat /debug/tracing/available_tracers\n" + "wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n" + "# cat /debug/tracing/current_tracer\n" + "none\n" + "# echo sched_switch > /debug/tracing/current_tracer\n" + "# cat /debug/tracing/current_tracer\n" + "sched_switch\n" + "# cat /debug/tracing/iter_ctrl\n" + "noprint-parent nosym-offset nosym-addr noverbose\n" + "# echo print-parent > /debug/tracing/iter_ctrl\n" + "# echo 1 > /debug/tracing/tracing_enabled\n" + "# cat /debug/tracing/trace > /tmp/trace.txt\n" + "echo 0 > /debug/tracing/tracing_enabled\n" +; + +static ssize_t +tracing_readme_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + readme_msg, strlen(readme_msg)); +} + +static struct file_operations tracing_readme_fops = { + .open = tracing_open_generic, + .read = tracing_readme_read, +}; + + static ssize_t tracing_ctrl_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -1598,6 +1631,11 @@ static __init void tracer_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs " "'tracing_threash' entry\n"); + entry = debugfs_create_file("README", 0644, d_tracer, + NULL, &tracing_readme_fops); + if (!entry) + pr_warning("Could not create debugfs 'README' entry\n"); + #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, -- cgit v1.2.3 From 77a2b37d227483fe52aead242652aee406c25bf0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:45 +0200 Subject: ftrace: startup tester on dynamic tracing. This patch adds a startup self test on dynamic code modification and filters. 
The test filters on a specific function, makes sure that no other function is traced, exectutes the function, then makes sure that the function is traced. This patch also fixes a slight bug with the ftrace selftest, where tracer_enabled was not being set. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 19 +++++++ kernel/trace/trace_selftest.c | 113 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 128 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6d4d2e86deb..5e9389faaf7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1010,6 +1010,25 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, return ret; } +/** + * ftrace_set_filter - set a function to filter on in ftrace + * @buf - the string that holds the function filter text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled. + * If @buf is NULL and reset is set, all functions will be enabled for tracing. + */ +notrace void ftrace_set_filter(unsigned char *buf, int len, int reset) +{ + mutex_lock(&ftrace_filter_lock); + if (reset) + ftrace_filter_reset(); + if (buf) + ftrace_match(buf, len); + mutex_unlock(&ftrace_filter_lock); +} + static int notrace ftrace_filter_release(struct inode *inode, struct file *file) { diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index c01874c3b1f..4c8a1b2d823 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -99,6 +99,100 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) } #ifdef CONFIG_FTRACE + +#ifdef CONFIG_DYNAMIC_FTRACE + +#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func +#define __STR(x) #x +#define STR(x) __STR(x) +static int DYN_FTRACE_TEST_NAME(void) +{ + /* used to call mcount */ + return 0; +} + +/* Test dynamic code modification and ftrace filters */ +int trace_selftest_startup_dynamic_tracing(struct tracer *trace, + struct trace_array *tr, + int (*func)(void)) +{ + unsigned long count; + int ret; + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; + + /* The ftrace test PASSED */ + printk(KERN_CONT "PASSED\n"); + pr_info("Testing dynamic ftrace: "); + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + tracer_enabled = 1; + + /* passed in by parameter to fool gcc from optimizing */ + func(); + + /* update the records */ + ret = ftrace_force_update(); + if (ret) { + printk(KERN_CONT ".. ftraced failed .. "); + return ret; + } + + /* filter only on our function */ + ftrace_set_filter(STR(DYN_FTRACE_TEST_NAME), + sizeof(STR(DYN_FTRACE_TEST_NAME)), 1); + + /* enable tracing */ + tr->ctrl = 1; + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + + /* we should have nothing in the buffer */ + ret = trace_test_buffer(tr, &count); + if (ret) + goto out; + + if (count) { + ret = -1; + printk(KERN_CONT ".. filter did not filter .. "); + goto out; + } + + /* call our function again */ + func(); + + /* sleep again */ + msleep(100); + + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + ftrace_enabled = 0; + + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + /* we should only have one item */ + if (!ret && count != 1) { + printk(KERN_CONT ".. 
filter failed .."); + ret = -1; + goto out; + } + out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + + /* Enable tracing on all functions again */ + ftrace_set_filter(NULL, 0, 1); + + return ret; +} +#else +# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) +#endif /* CONFIG_DYNAMIC_FTRACE */ /* * Simple verification test of ftrace function tracer. * Enable ftrace, sleep 1/10 second, and then read the trace @@ -109,8 +203,13 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) { unsigned long count; int ret; + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; - /* make sure functions have been recorded */ + /* make sure msleep has been recorded */ + msleep(1); + + /* force the recorded functions to be traced */ ret = ftrace_force_update(); if (ret) { printk(KERN_CONT ".. ftraced failed .. "); @@ -119,6 +218,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* start the tracing */ ftrace_enabled = 1; + tracer_enabled = 1; tr->ctrl = 1; trace->init(tr); @@ -136,8 +236,16 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); ret = -1; + goto out; } + ret = trace_selftest_startup_dynamic_tracing(trace, tr, + DYN_FTRACE_TEST_NAME); + + out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + return ret; } #endif /* CONFIG_FTRACE */ @@ -415,6 +523,3 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr return ret; } #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ - -#ifdef CONFIG_DYNAMIC_FTRACE -#endif /* CONFIG_DYNAMIC_FTRACE */ -- cgit v1.2.3 From 4e3c3333f3bd7eedfd21b1155b3c7cd24fc7f754 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:45 +0200 Subject: ftrace: fix time offset fix time offset calculations and ordering, plus make code more consistent. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 75 ++++++++++++++++++++++++++++++++++++++-------------- kernel/trace/trace.h | 9 ++++++- 2 files changed, 63 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c736dd2e068..8755a437048 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -120,7 +120,7 @@ static DEFINE_SPINLOCK(ftrace_max_lock); * structure. 
(this way the maximum trace is permanently saved, * for later retrieval via /debugfs/tracing/latency_trace) */ -static void notrace +static notrace void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data = tr->data[cpu]; @@ -333,15 +333,16 @@ void unregister_tracer(struct tracer *type) mutex_unlock(&trace_types_lock); } -void notrace tracing_reset(struct trace_array_cpu *data) +notrace void tracing_reset(struct trace_array_cpu *data) { data->trace_idx = 0; data->trace_current = head_page(data); data->trace_current_idx = 0; + data->time_offset = 0; } #ifdef CONFIG_FTRACE -static void notrace +static notrace void function_trace_call(unsigned long ip, unsigned long parent_ip) { struct trace_array *tr = &global_trace; @@ -398,7 +399,7 @@ static void trace_init_cmdlines(void) notrace void trace_stop_cmdline_recording(void); -static void notrace trace_save_cmdline(struct task_struct *tsk) +static notrace void trace_save_cmdline(struct task_struct *tsk) { unsigned map; unsigned idx; @@ -624,6 +625,7 @@ static void *find_next_entry_inc(struct trace_iterator *iter) iter->idx++; iter->next_idx[next_cpu]++; iter->next_page_idx[next_cpu]++; + if (iter->next_page_idx[next_cpu] >= ENTRIES_PER_PAGE) { struct trace_array_cpu *data = iter->tr->data[next_cpu]; @@ -635,19 +637,21 @@ static void *find_next_entry_inc(struct trace_iterator *iter) data->trace_pages.next; } } + iter->prev_ent = iter->ent; + iter->prev_cpu = iter->cpu; + iter->ent = next; iter->cpu = next_cpu; return next ? iter : NULL; } -static void notrace * -s_next(struct seq_file *m, void *v, loff_t *pos) +static notrace void *s_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_iterator *iter = m->private; - void *ent; void *last_ent = iter->ent; int i = (int)*pos; + void *ent; (*pos)++; @@ -693,6 +697,8 @@ static void *s_start(struct seq_file *m, loff_t *pos) iter->ent = NULL; iter->cpu = 0; iter->idx = -1; + iter->prev_ent = NULL; + iter->prev_cpu = -1; for_each_possible_cpu(i) { iter->next_idx[i] = 0; @@ -752,7 +758,7 @@ seq_print_sym_offset(struct seq_file *m, const char *fmt, unsigned long address) # define IP_FMT "%016lx" #endif -static void notrace +static notrace void seq_print_ip_sym(struct seq_file *m, unsigned long ip, unsigned long sym_flags) { if (!ip) { @@ -769,7 +775,7 @@ seq_print_ip_sym(struct seq_file *m, unsigned long ip, unsigned long sym_flags) seq_printf(m, " <" IP_FMT ">", ip); } -static void notrace print_lat_help_header(struct seq_file *m) +static notrace void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n"); seq_puts(m, "# / _-----=> irqs-off \n"); @@ -782,14 +788,14 @@ static void notrace print_lat_help_header(struct seq_file *m) seq_puts(m, "# \\ / ||||| \\ | / \n"); } -static void notrace print_func_help_header(struct seq_file *m) +static notrace void print_func_help_header(struct seq_file *m) { seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); seq_puts(m, "# | | | | |\n"); } -static void notrace +static notrace void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); @@ -858,7 +864,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) seq_puts(m, "\n"); } -static void notrace +static notrace void lat_print_generic(struct seq_file *m, struct trace_entry *entry, int cpu) { int hardirq, softirq; @@ -895,7 +901,7 @@ lat_print_generic(struct seq_file *m, struct trace_entry *entry, int cpu) unsigned long preempt_mark_thresh 
= 100; -static void notrace +static notrace void lat_print_timestamp(struct seq_file *m, unsigned long long abs_usecs, unsigned long rel_usecs) { @@ -910,7 +916,7 @@ lat_print_timestamp(struct seq_file *m, unsigned long long abs_usecs, static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; -static void notrace +static notrace void print_lat_fmt(struct seq_file *m, struct trace_iterator *iter, unsigned int trace_idx, int cpu) { @@ -966,20 +972,50 @@ print_lat_fmt(struct seq_file *m, struct trace_iterator *iter, } } -static void notrace +static notrace void sync_time_offset(struct trace_iterator *iter) +{ + struct trace_array_cpu *prev_array, *array; + struct trace_entry *prev_entry, *entry; + cycle_t prev_t, t; + + entry = iter->ent; + prev_entry = iter->prev_ent; + if (!prev_entry) + return; + + prev_array = iter->tr->data[iter->prev_cpu]; + array = iter->tr->data[iter->cpu]; + + prev_t = prev_entry->t + prev_array->time_offset; + t = entry->t + array->time_offset; + + /* + * If time goes backwards we increase the offset of + * the current array, to not have observable time warps. + * This will quickly synchronize the time offsets of + * multiple CPUs: + */ + if (t < prev_t) + array->time_offset += prev_t - t; +} + +static notrace void print_trace_fmt(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_entry *entry = iter->ent; + struct trace_entry *entry; unsigned long usec_rem; unsigned long long t; unsigned long secs; char *comm; int S; + sync_time_offset(iter); + entry = iter->ent; + comm = trace_find_cmdline(iter->ent->pid); - t = ns2usecs(entry->t); + t = ns2usecs(entry->t + iter->tr->data[iter->cpu]->time_offset); usec_rem = do_div(t, 1000000ULL); secs = (unsigned long)t; @@ -1158,7 +1194,7 @@ static int tracing_lt_open(struct inode *inode, struct file *file) } -static void notrace * +static notrace void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct tracer *t = m->private; @@ -1374,8 +1410,7 @@ tracing_ctrl_read(struct file *filp, char __user *ubuf, int r; r = sprintf(buf, "%ld\n", tr->ctrl); - return simple_read_from_buffer(ubuf, cnt, ppos, - buf, r); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cc1d34b8b77..5df8ff2b84a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -56,6 +56,8 @@ struct trace_array_cpu { void *trace_current; struct list_head trace_pages; atomic_t disabled; + cycle_t time_offset; + /* these fields get copied into max-trace: */ unsigned trace_current_idx; unsigned long trace_idx; @@ -114,14 +116,19 @@ struct tracer { struct trace_iterator { struct trace_array *tr; struct tracer *trace; + struct trace_entry *ent; + int cpu; + + struct trace_entry *prev_ent; + int prev_cpu; + unsigned long iter_flags; loff_t pos; unsigned long next_idx[NR_CPUS]; struct list_head *next_page[NR_CPUS]; unsigned next_page_idx[NR_CPUS]; long idx; - int cpu; }; void notrace tracing_reset(struct trace_array_cpu *data); -- cgit v1.2.3 From 08bafa0efcf29fe18ec39c2147077b597368b018 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:45 +0200 Subject: ftrace: disable all tracers on corrupted buffer If the trace buffer is detected to be corrupted, then we disable all tracers. 
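A hedged userspace sketch of the fail-safe pattern this change relies on, using illustrative names (buffer_corrupted(), validate_buffer(), trace_open()) rather than the kernel's: the corruption check sets a single disabled flag, and every open-style entry point tests it first, so a bad buffer turns the tracing interfaces into -ENODEV instead of letting readers walk corrupted data. The tracing_disabled flag and the -ENODEV checks in the real open paths appear in the hunks earlier in this series.

#include <errno.h>
#include <stdio.h>

static int tracing_disabled;		/* set once on corruption, never cleared */

static int buffer_corrupted(const int *entries, int n)
{
	int i;

	for (i = 0; i < n; i++)
		if (entries[i] < 0)	/* stand-in for an invalid entry type */
			return 1;
	return 0;
}

static void validate_buffer(const int *entries, int n)
{
	if (buffer_corrupted(entries, n)) {
		tracing_disabled = 1;	/* disable all tracers, as in the patch */
		fprintf(stderr, ".. corrupted trace buffer ..\n");
	}
}

static int trace_open(void)
{
	if (tracing_disabled)
		return -ENODEV;		/* mirrors the checks in the open paths */
	return 0;
}

int main(void)
{
	int good[] = { 1, 2, 3 };
	int bad[]  = { 1, -7, 3 };

	validate_buffer(good, 3);
	printf("open after good buffer: %d\n", trace_open());
	validate_buffer(bad, 3);
	printf("open after bad buffer:  %d\n", trace_open());
	return 0;
}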
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace_selftest.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 4c8a1b2d823..a6f1ed75f83 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -67,6 +67,8 @@ trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) return 0; failed: + /* disable tracing */ + tracing_disabled = 1; printk(KERN_CONT ".. corrupted trace buffer .. "); return -1; } -- cgit v1.2.3 From 1d4db00a5e30c7b8f8dc2a1b19e886fd942be143 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:45 +0200 Subject: ftrace: reset selftests The tests may leave stuff in the buffers. This resets the buffers after each test is run. If a test fails, it does not reset the buffer to avoid touching a buffer that is corrupted. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8755a437048..6580e7ed04b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -289,6 +289,13 @@ int register_tracer(struct tracer *type) printk(KERN_CONT "FAILED!\n"); goto out; } + /* Only reset on passing, to avoid touching corrupted buffers */ + for_each_possible_cpu(i) { + data = tr->data[i]; + if (!head_page(data)) + continue; + tracing_reset(data); + } printk(KERN_CONT "PASSED\n"); } #endif -- cgit v1.2.3 From 93a588f459da134be6ab17c4104e28441beb0d22 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:45 +0200 Subject: ftrace: change buffers to producer consumer This patch changes the way the CPU trace buffers are handled. Instead of always starting from the trace page head, the logic is changed to a producer consumer logic. This allows for the buffers to be drained while they are alive. 
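A small userspace sketch of the producer/consumer scheme described above, reduced to a single fixed-size array. The patch below does the same thing per CPU across a linked list of pages, tracking trace_head/trace_tail plus an index into each page; the explicit count field and the ring_* names here are simplifications for illustration.

#include <stdio.h>

#define RING_SIZE 4

struct ring {
	int buf[RING_SIZE];
	unsigned int head;	/* next slot the producer writes */
	unsigned int tail;	/* next slot the consumer reads  */
	unsigned int count;	/* entries currently stored      */
};

static void ring_produce(struct ring *r, int val)
{
	r->buf[r->head] = val;
	r->head = (r->head + 1) % RING_SIZE;
	if (r->count == RING_SIZE)
		r->tail = (r->tail + 1) % RING_SIZE;	/* overrun: drop oldest */
	else
		r->count++;
}

static int ring_consume(struct ring *r, int *val)
{
	if (!r->count)
		return 0;		/* empty: a live reader would block here */
	*val = r->buf[r->tail];
	r->tail = (r->tail + 1) % RING_SIZE;
	r->count--;
	return 1;
}

int main(void)
{
	struct ring r = { .head = 0, .tail = 0, .count = 0 };
	int i, v;

	for (i = 1; i <= 6; i++)	/* 6 writes into 4 slots: 1 and 2 overrun */
		ring_produce(&r, i);
	while (ring_consume(&r, &v))
		printf("%d ", v);	/* prints: 3 4 5 6 */
	printf("\n");
	return 0;
}

On overrun the oldest entry is sacrificed so the producer never blocks, which is what lets a consumer drain the buffer from the tail while tracing keeps writing at the head, as the trace_pipe reader added later in this series does.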
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 103 ++++++++++++++++++++++++++++++--------------------- kernel/trace/trace.h | 6 ++- 2 files changed, 65 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6580e7ed04b..777b859e1c2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -176,10 +176,9 @@ flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) INIT_LIST_HEAD(&flip_pages); - tr1->trace_current = NULL; - memcpy(&tr1->trace_current_idx, &tr2->trace_current_idx, + memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx, sizeof(struct trace_array_cpu) - - offsetof(struct trace_array_cpu, trace_current_idx)); + offsetof(struct trace_array_cpu, trace_head_idx)); check_pages(tr1); check_pages(tr2); @@ -228,7 +227,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_reset(max_tr.data[i]); flip_trace(max_tr.data[cpu], data); - tracing_reset(data); __update_max_tr(tr, tsk, cpu); @@ -343,9 +341,9 @@ void unregister_tracer(struct tracer *type) notrace void tracing_reset(struct trace_array_cpu *data) { data->trace_idx = 0; - data->trace_current = head_page(data); - data->trace_current_idx = 0; - data->time_offset = 0; + data->trace_head = data->trace_tail = head_page(data); + data->trace_head_idx = 0; + data->trace_tail_idx = 0; } #ifdef CONFIG_FTRACE @@ -470,38 +468,65 @@ notrace void tracing_record_cmdline(struct task_struct *tsk) trace_save_cmdline(tsk); } +static inline notrace struct list_head * +trace_next_list(struct trace_array_cpu *data, struct list_head *next) +{ + /* + * Roundrobin - but skip the head (which is not a real page): + */ + next = next->next; + if (unlikely(next == &data->trace_pages)) + next = next->next; + BUG_ON(next == &data->trace_pages); + + return next; +} + +static inline notrace void * +trace_next_page(struct trace_array_cpu *data, void *addr) +{ + struct list_head *next; + struct page *page; + + page = virt_to_page(addr); + + next = trace_next_list(data, &page->lru); + page = list_entry(next, struct page, lru); + + return page_address(page); +} + static inline notrace struct trace_entry * tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) { unsigned long idx, idx_next; struct trace_entry *entry; - struct list_head *next; - struct page *page; data->trace_idx++; - idx = data->trace_current_idx; + idx = data->trace_head_idx; idx_next = idx + 1; BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE); - entry = data->trace_current + idx * TRACE_ENTRY_SIZE; + entry = data->trace_head + idx * TRACE_ENTRY_SIZE; if (unlikely(idx_next >= ENTRIES_PER_PAGE)) { - page = virt_to_page(data->trace_current); - /* - * Roundrobin - but skip the head (which is not a real page): - */ - next = page->lru.next; - if (unlikely(next == &data->trace_pages)) - next = next->next; - BUG_ON(next == &data->trace_pages); - - page = list_entry(next, struct page, lru); - data->trace_current = page_address(page); + data->trace_head = trace_next_page(data, data->trace_head); idx_next = 0; } - data->trace_current_idx = idx_next; + if (data->trace_head == data->trace_tail && + idx_next == data->trace_tail_idx) { + /* overrun */ + data->trace_tail_idx++; + if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { + data->trace_tail = + trace_next_page(data, data->trace_tail); + data->trace_tail_idx = 0; + } + } + + data->trace_head_idx = idx_next; return entry; } @@ -571,21 +596,11 @@ trace_entry_idx(struct 
trace_array *tr, struct trace_array_cpu *data, return NULL; if (!iter->next_page[cpu]) { - /* - * Initialize. If the count of elements in - * this buffer is greater than the max entries - * we had an underrun. Which means we looped around. - * We can simply use the current pointer as our - * starting point. - */ - if (data->trace_idx >= tr->entries) { - page = virt_to_page(data->trace_current); - iter->next_page[cpu] = &page->lru; - iter->next_page_idx[cpu] = data->trace_current_idx; - } else { - iter->next_page[cpu] = data->trace_pages.next; - iter->next_page_idx[cpu] = 0; - } + /* Initialize the iterator for this cpu trace buffer */ + WARN_ON(!data->trace_tail); + page = virt_to_page(data->trace_tail); + iter->next_page[cpu] = &page->lru; + iter->next_page_idx[cpu] = data->trace_tail_idx; } page = list_entry(iter->next_page[cpu], struct page, lru); @@ -593,6 +608,12 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, array = page_address(page); + /* Still possible to catch up to the tail */ + if (iter->next_idx[cpu] && array == data->trace_tail && + iter->next_page_idx[cpu] == data->trace_tail_idx) + return NULL; + + WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE); return &array[iter->next_page_idx[cpu]]; } @@ -638,10 +659,8 @@ static void *find_next_entry_inc(struct trace_iterator *iter) iter->next_page_idx[next_cpu] = 0; iter->next_page[next_cpu] = - iter->next_page[next_cpu]->next; - if (iter->next_page[next_cpu] == &data->trace_pages) - iter->next_page[next_cpu] = - data->trace_pages.next; + trace_next_list(data, iter->next_page[next_cpu]); + } } iter->prev_ent = iter->ent; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5df8ff2b84a..0ce127455b4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -53,13 +53,15 @@ struct trace_entry { * the trace, etc.) */ struct trace_array_cpu { - void *trace_current; struct list_head trace_pages; atomic_t disabled; cycle_t time_offset; /* these fields get copied into max-trace: */ - unsigned trace_current_idx; + unsigned trace_head_idx; + unsigned trace_tail_idx; + void *trace_head; /* producer */ + void *trace_tail; /* consumer */ unsigned long trace_idx; unsigned long saved_latency; unsigned long critical_start; -- cgit v1.2.3 From 214023c3d13a71525e463b5e54e360b926b4dc90 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:46 +0200 Subject: ftrace: add a buffer for output Later patches will need to print the same things as the seq output does. But those outputs will not use the seq utility. This patch adds a buffer to the iterator, that can be used by either the seq utility or other output. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 202 ++++++++++++++++++++++++++++++++++----------------- kernel/trace/trace.h | 6 ++ 2 files changed, 140 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 777b859e1c2..d39f4faec7c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -169,6 +169,66 @@ void *head_page(struct trace_array_cpu *data) return page_address(page); } +static notrace int +trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
+{ + int len = (PAGE_SIZE - 1) - s->len; + va_list ap; + + if (!len) + return 0; + + va_start(ap, fmt); + len = vsnprintf(s->buffer + s->len, len, fmt, ap); + va_end(ap); + + s->len += len; + + return len; +} + +static notrace int +trace_seq_puts(struct trace_seq *s, const char *str) +{ + int len = strlen(str); + + if (len > ((PAGE_SIZE - 1) - s->len)) + len = (PAGE_SIZE - 1) - s->len; + + memcpy(s->buffer + s->len, str, len); + s->len += len; + + return len; +} + +static notrace int +trace_seq_putc(struct trace_seq *s, unsigned char c) +{ + if (s->len >= (PAGE_SIZE - 1)) + return 0; + + s->buffer[s->len++] = c; + + return 1; +} + +static notrace void +trace_seq_reset(struct trace_seq *s) +{ + s->len = 0; +} + +static notrace void +trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ + int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; + + s->buffer[len] = 0; + seq_puts(m, s->buffer); + + trace_seq_reset(s); +} + notrace static void flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) { @@ -756,25 +816,26 @@ static void s_stop(struct seq_file *m, void *p) } static void -seq_print_sym_short(struct seq_file *m, const char *fmt, unsigned long address) +seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) { #ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; kallsyms_lookup(address, NULL, NULL, NULL, str); - seq_printf(m, fmt, str); + trace_seq_printf(s, fmt, str); #endif } static void -seq_print_sym_offset(struct seq_file *m, const char *fmt, unsigned long address) +seq_print_sym_offset(struct trace_seq *s, const char *fmt, + unsigned long address) { #ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; sprint_symbol(str, address); - seq_printf(m, fmt, str); + trace_seq_printf(s, fmt, str); #endif } @@ -785,20 +846,20 @@ seq_print_sym_offset(struct seq_file *m, const char *fmt, unsigned long address) #endif static notrace void -seq_print_ip_sym(struct seq_file *m, unsigned long ip, unsigned long sym_flags) +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) { if (!ip) { - seq_printf(m, "0"); + trace_seq_printf(s, "0"); return; } if (sym_flags & TRACE_ITER_SYM_OFFSET) - seq_print_sym_offset(m, "%s", ip); + seq_print_sym_offset(s, "%s", ip); else - seq_print_sym_short(m, "%s", ip); + seq_print_sym_short(s, "%s", ip); if (sym_flags & TRACE_ITER_SYM_ADDR) - seq_printf(m, " <" IP_FMT ">", ip); + trace_seq_printf(s, " <" IP_FMT ">", ip); } static notrace void print_lat_help_header(struct seq_file *m) @@ -881,9 +942,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) if (data->critical_start) { seq_puts(m, " => started at: "); - seq_print_ip_sym(m, data->critical_start, sym_flags); + seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags); + trace_print_seq(m, &iter->seq); seq_puts(m, "\n => ended at: "); - seq_print_ip_sym(m, data->critical_end, sym_flags); + seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); + trace_print_seq(m, &iter->seq); seq_puts(m, "\n"); } @@ -891,61 +954,61 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) } static notrace void -lat_print_generic(struct seq_file *m, struct trace_entry *entry, int cpu) +lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) { int hardirq, softirq; char *comm; comm = trace_find_cmdline(entry->pid); - seq_printf(m, "%8.8s-%-5d ", comm, entry->pid); - seq_printf(m, "%d", cpu); - seq_printf(m, "%c%c", - (entry->flags & TRACE_FLAG_IRQS_OFF) ? 
'd' : '.', - ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); + trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); + trace_seq_printf(s, "%d", cpu); + trace_seq_printf(s, "%c%c", + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; if (hardirq && softirq) - seq_putc(m, 'H'); + trace_seq_putc(s, 'H'); else { if (hardirq) - seq_putc(m, 'h'); + trace_seq_putc(s, 'h'); else { if (softirq) - seq_putc(m, 's'); + trace_seq_putc(s, 's'); else - seq_putc(m, '.'); + trace_seq_putc(s, '.'); } } if (entry->preempt_count) - seq_printf(m, "%x", entry->preempt_count); + trace_seq_printf(s, "%x", entry->preempt_count); else - seq_puts(m, "."); + trace_seq_puts(s, "."); } unsigned long preempt_mark_thresh = 100; static notrace void -lat_print_timestamp(struct seq_file *m, unsigned long long abs_usecs, +lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, unsigned long rel_usecs) { - seq_printf(m, " %4lldus", abs_usecs); + trace_seq_printf(s, " %4lldus", abs_usecs); if (rel_usecs > preempt_mark_thresh) - seq_puts(m, "!: "); + trace_seq_puts(s, "!: "); else if (rel_usecs > 1) - seq_puts(m, "+: "); + trace_seq_puts(s, "+: "); else - seq_puts(m, " : "); + trace_seq_puts(s, " : "); } static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; static notrace void -print_lat_fmt(struct seq_file *m, struct trace_iterator *iter, - unsigned int trace_idx, int cpu) +print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) { + struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *next_entry = find_next_entry(iter, NULL); unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); @@ -962,39 +1025,40 @@ print_lat_fmt(struct seq_file *m, struct trace_iterator *iter, if (verbose) { comm = trace_find_cmdline(entry->pid); - seq_printf(m, "%16s %5d %d %d %08x %08x [%08lx]" - " %ld.%03ldms (+%ld.%03ldms): ", - comm, - entry->pid, cpu, entry->flags, - entry->preempt_count, trace_idx, - ns2usecs(entry->t), - abs_usecs/1000, - abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); + trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]" + " %ld.%03ldms (+%ld.%03ldms): ", + comm, + entry->pid, cpu, entry->flags, + entry->preempt_count, trace_idx, + ns2usecs(entry->t), + abs_usecs/1000, + abs_usecs % 1000, rel_usecs/1000, + rel_usecs % 1000); } else { - lat_print_generic(m, entry, cpu); - lat_print_timestamp(m, abs_usecs, rel_usecs); + lat_print_generic(s, entry, cpu); + lat_print_timestamp(s, abs_usecs, rel_usecs); } switch (entry->type) { case TRACE_FN: - seq_print_ip_sym(m, entry->fn.ip, sym_flags); - seq_puts(m, " ("); - seq_print_ip_sym(m, entry->fn.parent_ip, sym_flags); - seq_puts(m, ")\n"); + seq_print_ip_sym(s, entry->fn.ip, sym_flags); + trace_seq_puts(s, " ("); + seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags); + trace_seq_puts(s, ")\n"); break; case TRACE_CTX: S = entry->ctx.prev_state < sizeof(state_to_char) ? 
state_to_char[entry->ctx.prev_state] : 'X'; comm = trace_find_cmdline(entry->ctx.next_pid); - seq_printf(m, " %d:%d:%c --> %d:%d %s\n", - entry->ctx.prev_pid, - entry->ctx.prev_prio, - S, - entry->ctx.next_pid, - entry->ctx.next_prio, - comm); + trace_seq_printf(s, " %d:%d:%c --> %d:%d %s\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, + entry->ctx.next_pid, + entry->ctx.next_prio, + comm); break; default: - seq_printf(m, "Unknown type %d\n", entry->type); + trace_seq_printf(s, "Unknown type %d\n", entry->type); } } @@ -1026,8 +1090,9 @@ static notrace void sync_time_offset(struct trace_iterator *iter) } static notrace void -print_trace_fmt(struct seq_file *m, struct trace_iterator *iter) +print_trace_fmt(struct trace_iterator *iter) { + struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *entry; unsigned long usec_rem; @@ -1045,29 +1110,29 @@ print_trace_fmt(struct seq_file *m, struct trace_iterator *iter) usec_rem = do_div(t, 1000000ULL); secs = (unsigned long)t; - seq_printf(m, "%16s-%-5d ", comm, entry->pid); - seq_printf(m, "[%02d] ", iter->cpu); - seq_printf(m, "%5lu.%06lu: ", secs, usec_rem); + trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); + trace_seq_printf(s, "[%02d] ", iter->cpu); + trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); switch (entry->type) { case TRACE_FN: - seq_print_ip_sym(m, entry->fn.ip, sym_flags); + seq_print_ip_sym(s, entry->fn.ip, sym_flags); if ((sym_flags & TRACE_ITER_PRINT_PARENT) && entry->fn.parent_ip) { - seq_printf(m, " <-"); - seq_print_ip_sym(m, entry->fn.parent_ip, sym_flags); + trace_seq_printf(s, " <-"); + seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags); } - seq_printf(m, "\n"); + trace_seq_printf(s, "\n"); break; case TRACE_CTX: S = entry->ctx.prev_state < sizeof(state_to_char) ? state_to_char[entry->ctx.prev_state] : 'X'; - seq_printf(m, " %d:%d:%c ==> %d:%d\n", - entry->ctx.prev_pid, - entry->ctx.prev_prio, - S, - entry->ctx.next_pid, - entry->ctx.next_prio); + trace_seq_printf(s, " %d:%d:%c ==> %d:%d\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, + entry->ctx.next_pid, + entry->ctx.next_prio); break; } } @@ -1108,9 +1173,10 @@ static int s_show(struct seq_file *m, void *v) } } else { if (iter->iter_flags & TRACE_FILE_LAT_FMT) - print_lat_fmt(m, iter, iter->idx, iter->cpu); + print_lat_fmt(iter, iter->idx, iter->cpu); else - print_trace_fmt(m, iter); + print_trace_fmt(iter); + trace_print_seq(m, &iter->seq); } return 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0ce127455b4..f5b32ca0b45 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -111,11 +111,17 @@ struct tracer { int print_max; }; +struct trace_seq { + unsigned char buffer[PAGE_SIZE]; + unsigned int len; +}; + /* * Trace iterator - used by printout routines who present trace * results to users and which routines might sleep, etc: */ struct trace_iterator { + struct trace_seq seq; struct trace_array *tr; struct tracer *trace; -- cgit v1.2.3 From b3806b4316306dc9c542eff6c23d7d42918f3504 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:46 +0200 Subject: ftrace: user run time file reading This patch creates a file called trace_pipe in the tracing debug directory. This file is a consumer of the trace buffers. This means that reads of this file consumes the entries from the trace buffers so that they will not be read a second time, as contrast to the static buffers latency_trace and trace. 
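A minimal userspace consumer of this file, sketched from the description here rather than taken from the patch (the debugfs mount point and the buffer size are assumptions, adjust them for the running system), could look like this:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        ssize_t n;
        /* assumes debugfs is mounted on /debug */
        int fd = open("/debug/tracing/trace_pipe", O_RDONLY);

        if (fd < 0) {
                perror("open trace_pipe");
                return 1;
        }
        /*
         * Each successful read() consumes entries from the trace
         * buffers; the call blocks while the buffers are empty and
         * returns 0 (EOF) once tracing has been disabled.
         */
        while ((n = read(fd, buf, sizeof(buf))) > 0) {
                if (write(STDOUT_FILENO, buf, n) != n)
                        break;
        }
        close(fd);
        return 0;
}
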
Reading from the trace_pipe will remove the entries from trace and latency_trace too. The advantage that trace_pipe has is that it can record live traces. It will block when there is nothing in the buffer, and read the entries as they are entered. An EOF happens when tracing is disabled (tracing_enabled = 0). Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 353 ++++++++++++++++++++++++++++++++++++++++++++------- kernel/trace/trace.h | 2 + 2 files changed, 309 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d39f4faec7c..a40687a4413 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -174,15 +174,20 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) { int len = (PAGE_SIZE - 1) - s->len; va_list ap; + int ret; if (!len) return 0; va_start(ap, fmt); - len = vsnprintf(s->buffer + s->len, len, fmt, ap); + ret = vsnprintf(s->buffer + s->len, len, fmt, ap); va_end(ap); - s->len += len; + /* If we can't write it all, don't bother writing anything */ + if (ret > len) + return 0; + + s->len += ret; return len; } @@ -193,7 +198,7 @@ trace_seq_puts(struct trace_seq *s, const char *str) int len = strlen(str); if (len > ((PAGE_SIZE - 1) - s->len)) - len = (PAGE_SIZE - 1) - s->len; + return 0; memcpy(s->buffer + s->len, str, len); s->len += len; @@ -615,11 +620,13 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, { struct trace_entry *entry; + spin_lock(&data->lock); entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); entry->type = TRACE_FN; entry->fn.ip = ip; entry->fn.parent_ip = parent_ip; + spin_unlock(&data->lock); } notrace void @@ -630,6 +637,7 @@ tracing_sched_switch_trace(struct trace_array *tr, { struct trace_entry *entry; + spin_lock(&data->lock); entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); entry->type = TRACE_CTX; @@ -638,6 +646,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.prev_state = prev->state; entry->ctx.next_pid = next->pid; entry->ctx.next_prio = next->prio; + spin_unlock(&data->lock); } enum trace_file_type { @@ -652,7 +661,9 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, struct trace_entry *array; if (iter->next_idx[cpu] >= tr->entries || - iter->next_idx[cpu] >= data->trace_idx) + iter->next_idx[cpu] >= data->trace_idx || + (data->trace_head == data->trace_tail && + data->trace_head_idx == data->trace_tail_idx)) return NULL; if (!iter->next_page[cpu]) { @@ -702,33 +713,57 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu) return next; } -static void *find_next_entry_inc(struct trace_iterator *iter) +static notrace void +trace_iterator_increment(struct trace_iterator *iter) { - struct trace_entry *next; - int next_cpu = -1; + iter->idx++; + iter->next_idx[iter->cpu]++; + iter->next_page_idx[iter->cpu]++; + if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) { + struct trace_array_cpu *data = iter->tr->data[iter->cpu]; - next = find_next_entry(iter, &next_cpu); + iter->next_page_idx[iter->cpu] = 0; + iter->next_page[iter->cpu] = + trace_next_list(data, iter->next_page[iter->cpu]); + } +} - if (next) { - iter->idx++; - iter->next_idx[next_cpu]++; - iter->next_page_idx[next_cpu]++; +static notrace void +trace_consume(struct trace_iterator *iter) +{ + struct trace_array_cpu *data = iter->tr->data[iter->cpu]; + + data->trace_tail_idx++; + if (data->trace_tail_idx >= 
ENTRIES_PER_PAGE) { + data->trace_tail = trace_next_page(data, data->trace_tail); + data->trace_tail_idx = 0; + } - if (iter->next_page_idx[next_cpu] >= ENTRIES_PER_PAGE) { - struct trace_array_cpu *data = iter->tr->data[next_cpu]; + /* Check if we empty it, then reset the index */ + if (data->trace_head == data->trace_tail && + data->trace_head_idx == data->trace_tail_idx) + data->trace_idx = 0; - iter->next_page_idx[next_cpu] = 0; - iter->next_page[next_cpu] = - trace_next_list(data, iter->next_page[next_cpu]); + trace_iterator_increment(iter); +} + +static notrace void * +find_next_entry_inc(struct trace_iterator *iter) +{ + struct trace_entry *next; + int next_cpu = -1; + + next = find_next_entry(iter, &next_cpu); - } - } iter->prev_ent = iter->ent; iter->prev_cpu = iter->cpu; iter->ent = next; iter->cpu = next_cpu; + if (next) + trace_iterator_increment(iter); + return next ? iter : NULL; } @@ -815,7 +850,7 @@ static void s_stop(struct seq_file *m, void *p) mutex_unlock(&trace_types_lock); } -static void +static int seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) { #ifdef CONFIG_KALLSYMS @@ -823,11 +858,12 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) kallsyms_lookup(address, NULL, NULL, NULL, str); - trace_seq_printf(s, fmt, str); + return trace_seq_printf(s, fmt, str); #endif + return 1; } -static void +static int seq_print_sym_offset(struct trace_seq *s, const char *fmt, unsigned long address) { @@ -835,8 +871,9 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, char str[KSYM_SYMBOL_LEN]; sprint_symbol(str, address); - trace_seq_printf(s, fmt, str); + return trace_seq_printf(s, fmt, str); #endif + return 1; } #ifndef CONFIG_64BIT @@ -845,21 +882,25 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, # define IP_FMT "%016lx" #endif -static notrace void +static notrace int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) { - if (!ip) { - trace_seq_printf(s, "0"); - return; - } + int ret; + + if (!ip) + return trace_seq_printf(s, "0"); if (sym_flags & TRACE_ITER_SYM_OFFSET) - seq_print_sym_offset(s, "%s", ip); + ret = seq_print_sym_offset(s, "%s", ip); else - seq_print_sym_short(s, "%s", ip); + ret = seq_print_sym_short(s, "%s", ip); + + if (!ret) + return 0; if (sym_flags & TRACE_ITER_SYM_ADDR) - trace_seq_printf(s, " <" IP_FMT ">", ip); + ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + return ret; } static notrace void print_lat_help_header(struct seq_file *m) @@ -1089,7 +1130,7 @@ static notrace void sync_time_offset(struct trace_iterator *iter) array->time_offset += prev_t - t; } -static notrace void +static notrace int print_trace_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1100,6 +1141,7 @@ print_trace_fmt(struct trace_iterator *iter) unsigned long secs; char *comm; int S; + int ret; sync_time_offset(iter); entry = iter->ent; @@ -1110,31 +1152,49 @@ print_trace_fmt(struct trace_iterator *iter) usec_rem = do_div(t, 1000000ULL); secs = (unsigned long)t; - trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); - trace_seq_printf(s, "[%02d] ", iter->cpu); - trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); + ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); + if (!ret) + return 0; + ret = trace_seq_printf(s, "[%02d] ", iter->cpu); + if (!ret) + return 0; + ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); + if (!ret) + return 0; switch (entry->type) { case TRACE_FN: - seq_print_ip_sym(s, entry->fn.ip, 
sym_flags); + ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags); + if (!ret) + return 0; if ((sym_flags & TRACE_ITER_PRINT_PARENT) && entry->fn.parent_ip) { - trace_seq_printf(s, " <-"); - seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags); + ret = trace_seq_printf(s, " <-"); + if (!ret) + return 0; + ret = seq_print_ip_sym(s, entry->fn.parent_ip, + sym_flags); + if (!ret) + return 0; } - trace_seq_printf(s, "\n"); + ret = trace_seq_printf(s, "\n"); + if (!ret) + return 0; break; case TRACE_CTX: S = entry->ctx.prev_state < sizeof(state_to_char) ? state_to_char[entry->ctx.prev_state] : 'X'; - trace_seq_printf(s, " %d:%d:%c ==> %d:%d\n", - entry->ctx.prev_pid, - entry->ctx.prev_prio, - S, - entry->ctx.next_pid, - entry->ctx.next_prio); + ret = trace_seq_printf(s, " %d:%d:%c ==> %d:%d\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, + entry->ctx.next_pid, + entry->ctx.next_prio); + if (!ret) + return 0; break; } + return 1; } static int trace_empty(struct trace_iterator *iter) @@ -1145,7 +1205,9 @@ static int trace_empty(struct trace_iterator *iter) for_each_possible_cpu(cpu) { data = iter->tr->data[cpu]; - if (head_page(data) && data->trace_idx) + if (head_page(data) && data->trace_idx && + (data->trace_tail != data->trace_head || + data->trace_tail_idx != data->trace_head_idx)) return 0; } return 1; @@ -1645,6 +1707,192 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, return cnt; } +static atomic_t tracing_reader; + +static int tracing_open_pipe(struct inode *inode, struct file *filp) +{ + struct trace_iterator *iter; + + if (tracing_disabled) + return -ENODEV; + + /* We only allow for reader of the pipe */ + if (atomic_inc_return(&tracing_reader) != 1) { + atomic_dec(&tracing_reader); + return -EBUSY; + } + + /* create a buffer to store the information to pass to userspace */ + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + iter->tr = &global_trace; + + filp->private_data = iter; + + return 0; +} + +static int tracing_release_pipe(struct inode *inode, struct file *file) +{ + struct trace_iterator *iter = file->private_data; + + kfree(iter); + atomic_dec(&tracing_reader); + + return 0; +} + +/* + * Consumer reader. + */ +static ssize_t +tracing_read_pipe(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_iterator *iter = filp->private_data; + struct trace_array_cpu *data; + static cpumask_t mask; + struct trace_entry *entry; + static int start; + unsigned long flags; + int read = 0; + int cpu; + int len; + int ret; + + /* return any leftover data */ + if (iter->seq.len > start) { + len = iter->seq.len - start; + if (cnt > len) + cnt = len; + ret = copy_to_user(ubuf, iter->seq.buffer + start, cnt); + if (ret) + cnt = -EFAULT; + + start += len; + + return cnt; + } + + trace_seq_reset(&iter->seq); + start = 0; + + while (trace_empty(iter)) { + /* + * This is a make-shift waitqueue. The reason we don't use + * an actual wait queue is because: + * 1) we only ever have one waiter + * 2) the tracing, traces all functions, we don't want + * the overhead of calling wake_up and friends + * (and tracing them too) + * Anyway, this is really very primitive wakeup. + */ + set_current_state(TASK_INTERRUPTIBLE); + iter->tr->waiter = current; + + /* sleep for one second, and try again. */ + schedule_timeout(HZ); + + iter->tr->waiter = NULL; + + if (signal_pending(current)) + return -EINTR; + + /* + * We block until we read something and tracing is disabled. 
+ * We still block if tracing is disabled, but we have never + * read anything. This allows a user to cat this file, and + * then enable tracing. But after we have read something, + * we give an EOF when tracing is again disabled. + * + * iter->pos will be 0 if we haven't read anything. + */ + if (!tracer_enabled && iter->pos) + break; + + continue; + } + + /* stop when tracing is finished */ + if (trace_empty(iter)) + return 0; + + if (cnt >= PAGE_SIZE) + cnt = PAGE_SIZE - 1; + + memset(iter, 0, sizeof(*iter)); + iter->tr = &global_trace; + iter->pos = -1; + + /* + * We need to stop all tracing on all CPUS to read the + * the next buffer. This is a bit expensive, but is + * not done often. We fill all what we can read, + * and then release the locks again. + */ + + cpus_clear(mask); + local_irq_save(flags); + for_each_possible_cpu(cpu) { + data = iter->tr->data[cpu]; + + if (!head_page(data) || !data->trace_idx) + continue; + + atomic_inc(&data->disabled); + spin_lock(&data->lock); + cpu_set(cpu, mask); + } + + while ((entry = find_next_entry(iter, &cpu))) { + + if (!entry) + break; + + iter->ent = entry; + iter->cpu = cpu; + + ret = print_trace_fmt(iter); + if (!ret) + break; + + trace_consume(iter); + + if (iter->seq.len >= cnt) + break; + + } + + for_each_possible_cpu(cpu) { + data = iter->tr->data[cpu]; + + if (!cpu_isset(cpu, mask)) + continue; + spin_unlock(&data->lock); + atomic_dec(&data->disabled); + } + local_irq_restore(flags); + + /* Now copy what we have to the user */ + read = iter->seq.len; + if (read > cnt) + read = cnt; + + ret = copy_to_user(ubuf, iter->seq.buffer, read); + + if (read < iter->seq.len) + start = read; + else + trace_seq_reset(&iter->seq); + + if (ret) + read = -EFAULT; + + return read; +} + static struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -1663,6 +1911,12 @@ static struct file_operations set_tracer_fops = { .write = tracing_set_trace_write, }; +static struct file_operations tracing_pipe_fops = { + .open = tracing_open_pipe, + .read = tracing_read_pipe, + .release = tracing_release_pipe, +}; + #ifdef CONFIG_DYNAMIC_FTRACE static ssize_t @@ -1763,6 +2017,11 @@ static __init void tracer_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs 'README' entry\n"); + entry = debugfs_create_file("trace_pipe", 0644, d_tracer, + NULL, &tracing_pipe_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'tracing_threash' entry\n"); #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, @@ -1816,6 +2075,7 @@ static int trace_alloc_page(void) /* Now that we successfully allocate a page per CPU, add them */ for_each_possible_cpu(i) { data = global_trace.data[i]; + spin_lock_init(&data->lock); page = list_entry(pages.next, struct page, lru); list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); @@ -1823,6 +2083,7 @@ static int trace_alloc_page(void) #ifdef CONFIG_TRACER_MAX_TRACE data = max_tr.data[i]; + spin_lock_init(&data->lock); page = list_entry(pages.next, struct page, lru); list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f5b32ca0b45..29a7ea59de5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -55,6 +55,7 @@ struct trace_entry { struct trace_array_cpu { struct list_head trace_pages; atomic_t disabled; + spinlock_t lock; cycle_t time_offset; /* these fields get copied into max-trace: */ @@ -88,6 +89,7 @@ struct 
trace_array { long ctrl; int cpu; cycle_t time_start; + struct task_struct *waiter; struct trace_array_cpu *data[NR_CPUS]; }; -- cgit v1.2.3 From d4c5a2f5870939d837293de87b41dda0012a4572 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:46 +0200 Subject: ftrace: fix locking we can hold all cpu trace buffer locks at once - put each into a separate lock class. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 8 +++----- kernel/trace/trace.h | 1 + 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a40687a4413..b3811ca7407 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1865,11 +1865,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, } - for_each_possible_cpu(cpu) { + for_each_cpu_mask(cpu, mask) { data = iter->tr->data[cpu]; - - if (!cpu_isset(cpu, mask)) - continue; spin_unlock(&data->lock); atomic_dec(&data->disabled); } @@ -2076,6 +2073,7 @@ static int trace_alloc_page(void) for_each_possible_cpu(i) { data = global_trace.data[i]; spin_lock_init(&data->lock); + lockdep_set_class(&data->lock, &data->lock_key); page = list_entry(pages.next, struct page, lru); list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); @@ -2084,6 +2082,7 @@ static int trace_alloc_page(void) #ifdef CONFIG_TRACER_MAX_TRACE data = max_tr.data[i]; spin_lock_init(&data->lock); + lockdep_set_class(&data->lock, &data->lock_key); page = list_entry(pages.next, struct page, lru); list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); @@ -2203,5 +2202,4 @@ __init static int tracer_alloc_buffers(void) } return ret; } - fs_initcall(tracer_alloc_buffers); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 29a7ea59de5..b0408356f0e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -56,6 +56,7 @@ struct trace_array_cpu { struct list_head trace_pages; atomic_t disabled; spinlock_t lock; + struct lock_class_key lock_key; cycle_t time_offset; /* these fields get copied into max-trace: */ -- cgit v1.2.3 From 4bf39a9411a4ce8712954e03a9bd1592ee345919 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:46 +0200 Subject: ftrace: cleanups no code changed. 
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 5 +++- kernel/trace/trace.c | 72 +++++++++++++++++++++++++-------------------------- 2 files changed, 39 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5e9389faaf7..97c40865a93 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -756,9 +756,11 @@ ftrace_avail_open(struct inode *inode, struct file *file) ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { struct seq_file *m = file->private_data; + m->private = iter; - } else + } else { kfree(iter); + } return ret; } @@ -770,6 +772,7 @@ int ftrace_avail_release(struct inode *inode, struct file *file) seq_release(inode, file); kfree(iter); + return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b3811ca7407..4550afda960 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1245,10 +1245,10 @@ static int s_show(struct seq_file *m, void *v) } static struct seq_operations tracer_seq_ops = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, }; static struct trace_iterator notrace * @@ -1397,10 +1397,10 @@ static int t_show(struct seq_file *m, void *v) } static struct seq_operations show_traces_seq_ops = { - .start = t_start, - .next = t_next, - .stop = t_stop, - .show = t_show, + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, }; static int show_traces_open(struct inode *inode, struct file *file) @@ -1420,17 +1420,17 @@ static int show_traces_open(struct inode *inode, struct file *file) } static struct file_operations tracing_fops = { - .open = tracing_open, - .read = seq_read, - .llseek = seq_lseek, - .release = tracing_release, + .open = tracing_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_release, }; static struct file_operations tracing_lt_fops = { - .open = tracing_lt_open, - .read = seq_read, - .llseek = seq_lseek, - .release = tracing_release, + .open = tracing_lt_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_release, }; static struct file_operations show_traces_fops = { @@ -1620,8 +1620,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, r = sprintf(buf, "\n"); mutex_unlock(&trace_types_lock); - return simple_read_from_buffer(ubuf, cnt, ppos, - buf, r); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t @@ -1680,8 +1679,7 @@ tracing_max_lat_read(struct file *filp, char __user *ubuf, *ptr == (unsigned long)-1 ? 
-1 : nsecs_to_usecs(*ptr)); if (r > 64) r = 64; - return simple_read_from_buffer(ubuf, cnt, ppos, - buf, r); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t @@ -1891,27 +1889,27 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, } static struct file_operations tracing_max_lat_fops = { - .open = tracing_open_generic, - .read = tracing_max_lat_read, - .write = tracing_max_lat_write, + .open = tracing_open_generic, + .read = tracing_max_lat_read, + .write = tracing_max_lat_write, }; static struct file_operations tracing_ctrl_fops = { - .open = tracing_open_generic, - .read = tracing_ctrl_read, - .write = tracing_ctrl_write, + .open = tracing_open_generic, + .read = tracing_ctrl_read, + .write = tracing_ctrl_write, }; static struct file_operations set_tracer_fops = { - .open = tracing_open_generic, - .read = tracing_set_trace_read, - .write = tracing_set_trace_write, + .open = tracing_open_generic, + .read = tracing_set_trace_read, + .write = tracing_set_trace_write, }; static struct file_operations tracing_pipe_fops = { - .open = tracing_open_pipe, - .read = tracing_read_pipe, - .release = tracing_release_pipe, + .open = tracing_open_pipe, + .read = tracing_read_pipe, + .release = tracing_release_pipe, }; #ifdef CONFIG_DYNAMIC_FTRACE @@ -1925,13 +1923,13 @@ tracing_read_long(struct file *filp, char __user *ubuf, int r; r = sprintf(buf, "%ld\n", *p); - return simple_read_from_buffer(ubuf, cnt, ppos, - buf, r); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static struct file_operations tracing_read_long_fops = { - .open = tracing_open_generic, - .read = tracing_read_long, + .open = tracing_open_generic, + .read = tracing_read_long, }; #endif @@ -2033,7 +2031,7 @@ static __init void tracer_init_debugfs(void) /* dummy trace to disable tracing */ static struct tracer no_tracer __read_mostly = { - .name = "none", + .name = "none", }; static int trace_alloc_page(void) -- cgit v1.2.3 From 750ed1a40783432d0dcb0e6c2e813a12615d7664 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:46 +0200 Subject: ftrace: timestamp syncing, prepare rename and uninline now() to ftrace_now(). Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 4 ++-- kernel/trace/trace.c | 7 ++++++- kernel/trace/trace.h | 5 +---- kernel/trace/trace_functions.c | 2 +- kernel/trace/trace_irqsoff.c | 6 +++--- kernel/trace/trace_sched_switch.c | 2 +- kernel/trace/trace_sched_wakeup.c | 4 ++-- 7 files changed, 16 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 97c40865a93..a15e068535f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -531,7 +531,7 @@ static int notrace __ftrace_update_code(void *ignore) save_ftrace_enabled = ftrace_enabled; ftrace_enabled = 0; - start = now(raw_smp_processor_id()); + start = ftrace_now(raw_smp_processor_id()); ftrace_update_cnt = 0; /* No locks needed, the machine is stopped! 
*/ @@ -550,7 +550,7 @@ static int notrace __ftrace_update_code(void *ignore) } - stop = now(raw_smp_processor_id()); + stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4550afda960..e3778ab0d3f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -42,6 +42,11 @@ ns2usecs(cycle_t nsec) return nsec; } +notrace cycle_t ftrace_now(int cpu) +{ + return cpu_clock(cpu); +} + static atomic_t tracer_counter; static struct trace_array global_trace; @@ -607,7 +612,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) entry->idx = atomic_inc_return(&tracer_counter); entry->preempt_count = pc & 0xff; entry->pid = tsk->pid; - entry->t = now(raw_smp_processor_id()); + entry->t = ftrace_now(raw_smp_processor_id()); entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b0408356f0e..30cad677e9d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -171,10 +171,7 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); -static inline notrace cycle_t now(int cpu) -{ - return cpu_clock(cpu); -} +extern notrace cycle_t ftrace_now(int cpu); #ifdef CONFIG_SCHED_TRACER extern void notrace diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5d8ad7a0960..e5d34b78fc9 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -20,7 +20,7 @@ static notrace void function_reset(struct trace_array *tr) { int cpu; - tr->time_start = now(tr->cpu); + tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) tracing_reset(tr->data[cpu]); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 2dfebb67fdf..d2a6e6f1ad2 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -136,7 +136,7 @@ check_critical_timing(struct trace_array *tr, * as long as possible: */ T0 = data->preempt_timestamp; - T1 = now(cpu); + T1 = ftrace_now(cpu); delta = T1-T0; local_save_flags(flags); @@ -186,7 +186,7 @@ out_unlock: out: data->critical_sequence = max_sequence; - data->preempt_timestamp = now(cpu); + data->preempt_timestamp = ftrace_now(cpu); tracing_reset(data); ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); } @@ -215,7 +215,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); data->critical_sequence = max_sequence; - data->preempt_timestamp = now(cpu); + data->preempt_timestamp = ftrace_now(cpu); data->critical_start = parent_ip ? 
: ip; tracing_reset(data); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 6c9284103a6..8d656672da9 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -61,7 +61,7 @@ static notrace void sched_switch_reset(struct trace_array *tr) { int cpu; - tr->time_start = now(tr->cpu); + tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) tracing_reset(tr->data[cpu]); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 688df965f3f..b7df825c3af 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -92,7 +92,7 @@ wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) * as long as possible: */ T0 = data->preempt_timestamp; - T1 = now(cpu); + T1 = ftrace_now(cpu); delta = T1-T0; if (!report_latency(delta)) @@ -191,7 +191,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p, local_save_flags(flags); - tr->data[wakeup_cpu]->preempt_timestamp = now(cpu); + tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); ftrace(tr, tr->data[wakeup_cpu], CALLER_ADDR1, CALLER_ADDR2, flags); out_locked: -- cgit v1.2.3 From 53c37c17aafcf50f7c6fddaf01dda8f9d7e31ddf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:46 +0200 Subject: ftrace: fast, scalable, synchronized timestamps implement globally synchronized, fast and scalable time source for tracing. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e3778ab0d3f..9a931c7c2da 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -42,9 +42,61 @@ ns2usecs(cycle_t nsec) return nsec; } +static const int time_sync_freq_max = 128; +static const cycle_t time_sync_thresh = 100000; + +static DEFINE_PER_CPU(cycle_t, time_offset); +static DEFINE_PER_CPU(cycle_t, prev_cpu_time); +static DEFINE_PER_CPU(int, time_sync_count); +static DEFINE_PER_CPU(int, time_sync_freq); + +/* + * Global lock which we take every now and then to synchronize + * the CPUs time. 
This method is not warp-safe, but it's good + * enough to synchronize slowly diverging time sources and thus + * it's good enough for tracing: + */ +static DEFINE_SPINLOCK(time_sync_lock); +static cycle_t prev_global_time; + +static notrace cycle_t __ftrace_now_sync(cycles_t time, int cpu) +{ + unsigned long flags; + + spin_lock_irqsave(&time_sync_lock, flags); + + /* + * Update the synchronization frequency: + */ + if (per_cpu(time_sync_freq, cpu) < time_sync_freq_max) + per_cpu(time_sync_freq, cpu) *= 2; + per_cpu(time_sync_count, cpu) = per_cpu(time_sync_freq, cpu); + + if (time < prev_global_time) { + per_cpu(time_offset, cpu) += prev_global_time - time; + time = prev_global_time; + } else { + prev_global_time = time; + } + + spin_unlock_irqrestore(&time_sync_lock, flags); + + return time; +} + notrace cycle_t ftrace_now(int cpu) { - return cpu_clock(cpu); + cycle_t prev_cpu_time, time, delta_time; + + prev_cpu_time = per_cpu(prev_cpu_time, cpu); + time = sched_clock() + per_cpu(time_offset, cpu); + delta_time = time-prev_cpu_time; + + if (unlikely(delta_time > time_sync_thresh || + --per_cpu(time_sync_count, cpu) <= 0)) + time = __ftrace_now_sync(time, cpu); + + return time; } static atomic_t tracer_counter; -- cgit v1.2.3 From cdd31cd2d7a0dcbec2cce3974f7129dd4fc8c879 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:46 +0200 Subject: ftrace: remove-idx-sync remove idx syncing - it's expensive on SMP. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 40 +++++----------------------------------- kernel/trace/trace.h | 2 -- 2 files changed, 5 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9a931c7c2da..ce8ceb8aea6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -99,7 +99,6 @@ notrace cycle_t ftrace_now(int cpu) return time; } -static atomic_t tracer_counter; static struct trace_array global_trace; static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); @@ -661,7 +660,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) pc = preempt_count(); - entry->idx = atomic_inc_return(&tracer_counter); entry->preempt_count = pc & 0xff; entry->pid = tsk->pid; entry->t = ftrace_now(raw_smp_processor_id()); @@ -757,8 +755,10 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu) if (!head_page(tr->data[cpu])) continue; ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); - if (ent && - (!next || (long)(next->idx - ent->idx) > 0)) { + /* + * Pick the entry with the smallest timestamp: + */ + if (ent && (!next || ent->t < next->t)) { next = ent; next_cpu = cpu; } @@ -800,8 +800,6 @@ trace_consume(struct trace_iterator *iter) if (data->trace_head == data->trace_tail && data->trace_head_idx == data->trace_tail_idx) data->trace_idx = 0; - - trace_iterator_increment(iter); } static notrace void * @@ -1160,33 +1158,6 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) } } -static notrace void sync_time_offset(struct trace_iterator *iter) -{ - struct trace_array_cpu *prev_array, *array; - struct trace_entry *prev_entry, *entry; - cycle_t prev_t, t; - - entry = iter->ent; - prev_entry = iter->prev_ent; - if (!prev_entry) - return; - - prev_array = iter->tr->data[iter->prev_cpu]; - array = iter->tr->data[iter->cpu]; - - prev_t = prev_entry->t + prev_array->time_offset; - t = entry->t + array->time_offset; - - /* - * If time goes backwards we increase the offset of - * the current array, to not have 
observable time warps. - * This will quickly synchronize the time offsets of - * multiple CPUs: - */ - if (t < prev_t) - array->time_offset += prev_t - t; -} - static notrace int print_trace_fmt(struct trace_iterator *iter) { @@ -1200,12 +1171,11 @@ print_trace_fmt(struct trace_iterator *iter) int S; int ret; - sync_time_offset(iter); entry = iter->ent; comm = trace_find_cmdline(iter->ent->pid); - t = ns2usecs(entry->t + iter->tr->data[iter->cpu]->time_offset); + t = ns2usecs(entry->t); usec_rem = do_div(t, 1000000ULL); secs = (unsigned long)t; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 30cad677e9d..27fa2d06f49 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -38,7 +38,6 @@ struct trace_entry { char preempt_count; int pid; cycle_t t; - unsigned long idx; union { struct ftrace_entry fn; struct ctx_switch_entry ctx; @@ -57,7 +56,6 @@ struct trace_array_cpu { atomic_t disabled; spinlock_t lock; struct lock_class_key lock_key; - cycle_t time_offset; /* these fields get copied into max-trace: */ unsigned trace_head_idx; -- cgit v1.2.3 From 8c523a9d82dbc4f3f7d972df8c0f1eacd83d0d55 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:46 +0200 Subject: ftrace: clean-up-pipe-iteration Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ce8ceb8aea6..42f1926acf7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -770,12 +770,12 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu) return next; } -static notrace void -trace_iterator_increment(struct trace_iterator *iter) +static notrace void trace_iterator_increment(struct trace_iterator *iter) { iter->idx++; iter->next_idx[iter->cpu]++; iter->next_page_idx[iter->cpu]++; + if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) { struct trace_array_cpu *data = iter->tr->data[iter->cpu]; @@ -785,8 +785,7 @@ trace_iterator_increment(struct trace_iterator *iter) } } -static notrace void -trace_consume(struct trace_iterator *iter) +static notrace void trace_consume(struct trace_iterator *iter) { struct trace_array_cpu *data = iter->tr->data[iter->cpu]; @@ -802,8 +801,7 @@ trace_consume(struct trace_iterator *iter) data->trace_idx = 0; } -static notrace void * -find_next_entry_inc(struct trace_iterator *iter) +static notrace void *find_next_entry_inc(struct trace_iterator *iter) { struct trace_entry *next; int next_cpu = -1; @@ -1871,14 +1869,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, cpu_set(cpu, mask); } - while ((entry = find_next_entry(iter, &cpu))) { - - if (!entry) - break; - - iter->ent = entry; - iter->cpu = cpu; - + while ((entry = find_next_entry_inc(iter)) != NULL) { ret = print_trace_fmt(iter); if (!ret) break; @@ -1887,7 +1878,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, if (iter->seq.len >= cnt) break; - } for_each_cpu_mask(cpu, mask) { -- cgit v1.2.3 From f9896bf30928922a3913a3810a4ab7908da6cfe7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:47 +0200 Subject: ftrace: add raw output Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 63 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 42f1926acf7..bebd263f582 100644 --- a/kernel/trace/trace.c +++ 
b/kernel/trace/trace.c @@ -153,6 +153,7 @@ enum trace_iterator_flags { TRACE_ITER_SYM_OFFSET = 0x02, TRACE_ITER_SYM_ADDR = 0x04, TRACE_ITER_VERBOSE = 0x08, + TRACE_ITER_RAW = 0x10, }; #define TRACE_ITER_SYM_MASK \ @@ -164,6 +165,7 @@ static const char *trace_options[] = { "sym-offset", "sym-addr", "verbose", + "raw", NULL }; @@ -1099,7 +1101,7 @@ lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; -static notrace void +static notrace int print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) { struct trace_seq *s = &iter->seq; @@ -1154,10 +1156,10 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) default: trace_seq_printf(s, "Unknown type %d\n", entry->type); } + return 1; } -static notrace int -print_trace_fmt(struct trace_iterator *iter) +static notrace int print_trace_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); @@ -1222,6 +1224,43 @@ print_trace_fmt(struct trace_iterator *iter) return 1; } +static notrace int print_raw_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry; + int ret; + int S; + + entry = iter->ent; + + ret = trace_seq_printf(s, "%d %d %llu ", + entry->pid, iter->cpu, entry->t); + if (!ret) + return 0; + + switch (entry->type) { + case TRACE_FN: + ret = trace_seq_printf(s, "%x %x\n", + entry->fn.ip, entry->fn.parent_ip); + if (!ret) + return 0; + break; + case TRACE_CTX: + S = entry->ctx.prev_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.prev_state] : 'X'; + ret = trace_seq_printf(s, "%d %d %c %d %d\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, + entry->ctx.next_pid, + entry->ctx.next_prio); + if (!ret) + return 0; + break; + } + return 1; +} + static int trace_empty(struct trace_iterator *iter) { struct trace_array_cpu *data; @@ -1238,6 +1277,17 @@ static int trace_empty(struct trace_iterator *iter) return 1; } +static int print_trace_line(struct trace_iterator *iter) +{ + if (trace_flags & TRACE_ITER_RAW) + return print_raw_fmt(iter); + + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + return print_lat_fmt(iter, iter->idx, iter->cpu); + + return print_trace_fmt(iter); +} + static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; @@ -1259,10 +1309,7 @@ static int s_show(struct seq_file *m, void *v) print_func_help_header(m); } } else { - if (iter->iter_flags & TRACE_FILE_LAT_FMT) - print_lat_fmt(iter, iter->idx, iter->cpu); - else - print_trace_fmt(iter); + print_trace_line(iter); trace_print_seq(m, &iter->seq); } @@ -1870,7 +1917,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, } while ((entry = find_next_entry_inc(iter)) != NULL) { - ret = print_trace_fmt(iter); + ret = print_trace_line(iter); if (!ret) break; -- cgit v1.2.3 From cb0f12aae8d085140d37ada351aa5a8e76c3f9b0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:47 +0200 Subject: ftrace: bin-output Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bebd263f582..d78cbc4fc51 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -154,6 +154,7 @@ enum trace_iterator_flags { TRACE_ITER_SYM_ADDR = 0x04, TRACE_ITER_VERBOSE = 0x08, TRACE_ITER_RAW = 0x10, + TRACE_ITER_BIN 
= 0x20, }; #define TRACE_ITER_SYM_MASK \ @@ -166,6 +167,7 @@ static const char *trace_options[] = { "sym-addr", "verbose", "raw", + "bin", NULL }; @@ -275,6 +277,18 @@ trace_seq_putc(struct trace_seq *s, unsigned char c) return 1; } +static notrace int +trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) +{ + if (len > ((PAGE_SIZE - 1) - s->len)) + return 0; + + memcpy(s->buffer + s->len, mem, len); + s->len += len; + + return len; +} + static notrace void trace_seq_reset(struct trace_seq *s) { @@ -1261,6 +1275,39 @@ static notrace int print_raw_fmt(struct trace_iterator *iter) return 1; } +#define SEQ_PUT_FIELD_RET(s, x) \ +do { \ + if (!trace_seq_putmem(s, &(x), sizeof(x))) \ + return 0; \ +} while (0) + +static notrace int print_bin_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry; + + entry = iter->ent; + + SEQ_PUT_FIELD_RET(s, entry->pid); + SEQ_PUT_FIELD_RET(s, entry->cpu); + SEQ_PUT_FIELD_RET(s, entry->t); + + switch (entry->type) { + case TRACE_FN: + SEQ_PUT_FIELD_RET(s, entry->fn.ip); + SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip); + break; + case TRACE_CTX: + SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid); + SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio); + SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state); + SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid); + SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); + break; + } + return 1; +} + static int trace_empty(struct trace_iterator *iter) { struct trace_array_cpu *data; @@ -1279,6 +1326,9 @@ static int trace_empty(struct trace_iterator *iter) static int print_trace_line(struct trace_iterator *iter) { + if (trace_flags & TRACE_ITER_BIN) + return print_bin_fmt(iter); + if (trace_flags & TRACE_ITER_RAW) return print_raw_fmt(iter); -- cgit v1.2.3 From f0a920d5752e1788c0cba2add103076bcc0f7a49 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:47 +0200 Subject: ftrace: add trace_special() for ad-hoc tracing. 
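The patch below only adds the hook itself; a hypothetical call site (an illustration, not part of this commit) would hand three scalar values to the per-cpu trace buffer, where they show up in the normal output as " %lx %lx %lx":

/* hypothetical debug hook; tr and cpu come from the surrounding tracer */
static void notrace my_probe(struct trace_array *tr, int cpu,
                             unsigned long a, unsigned long b,
                             unsigned long c)
{
        struct trace_array_cpu *data = tr->data[cpu];

        if (!atomic_read(&data->disabled))
                trace_special(tr, data, a, b, c);
}
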
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 15 +++++++++++++++ 2 files changed, 59 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d78cbc4fc51..fa13059eb46 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -137,6 +137,7 @@ enum trace_type { TRACE_FN, TRACE_CTX, + TRACE_SPECIAL, __TRACE_LAST_TYPE }; @@ -700,6 +701,22 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, spin_unlock(&data->lock); } +notrace void +trace_special(struct trace_array *tr, struct trace_array_cpu *data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + struct trace_entry *entry; + + spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_SPECIAL; + entry->special.arg1 = arg1; + entry->special.arg2 = arg2; + entry->special.arg3 = arg3; + spin_unlock(&data->lock); +} + notrace void tracing_sched_switch_trace(struct trace_array *tr, struct trace_array_cpu *data, @@ -1167,6 +1184,12 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) entry->ctx.next_prio, comm); break; + case TRACE_SPECIAL: + trace_seq_printf(s, " %lx %lx %lx\n", + entry->special.arg1, + entry->special.arg2, + entry->special.arg3); + break; default: trace_seq_printf(s, "Unknown type %d\n", entry->type); } @@ -1234,6 +1257,14 @@ static notrace int print_trace_fmt(struct trace_iterator *iter) if (!ret) return 0; break; + case TRACE_SPECIAL: + ret = trace_seq_printf(s, " %lx %lx %lx\n", + entry->special.arg1, + entry->special.arg2, + entry->special.arg3); + if (!ret) + return 0; + break; } return 1; } @@ -1271,6 +1302,14 @@ static notrace int print_raw_fmt(struct trace_iterator *iter) if (!ret) return 0; break; + case TRACE_SPECIAL: + ret = trace_seq_printf(s, " %lx %lx %lx\n", + entry->special.arg1, + entry->special.arg2, + entry->special.arg3); + if (!ret) + return 0; + break; } return 1; } @@ -1304,6 +1343,11 @@ static notrace int print_bin_fmt(struct trace_iterator *iter) SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid); SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); break; + case TRACE_SPECIAL: + SEQ_PUT_FIELD_RET(s, entry->special.arg1); + SEQ_PUT_FIELD_RET(s, entry->special.arg2); + SEQ_PUT_FIELD_RET(s, entry->special.arg3); + break; } return 1; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 27fa2d06f49..7bdfef35c05 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -25,6 +25,15 @@ struct ctx_switch_entry { unsigned char next_prio; }; +/* + * Special (free-form) trace entry: + */ +struct special_entry { + unsigned long arg1; + unsigned long arg2; + unsigned long arg3; +}; + /* * The trace entry - the most basic unit of tracing. 
This is what * is printed in the end as a single line in the trace output, such as: @@ -41,6 +50,7 @@ struct trace_entry { union { struct ftrace_entry fn; struct ctx_switch_entry ctx; + struct special_entry special; }; }; @@ -154,6 +164,11 @@ void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *next, unsigned long flags); void tracing_record_cmdline(struct task_struct *tsk); +void trace_special(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3); void tracing_start_function_trace(void); void tracing_stop_function_trace(void); -- cgit v1.2.3 From dcb6308f2b56720599f6b9d5a01c33e67e69bde4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:48 +0200 Subject: ftrace, locking fix should be an irq-safe lock ... Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fa13059eb46..70f94fa92c1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -691,14 +691,15 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, unsigned long flags) { struct trace_entry *entry; + unsigned long irq_flags; - spin_lock(&data->lock); + spin_lock_irqsave(&data->lock, irq_flags); entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); entry->type = TRACE_FN; entry->fn.ip = ip; entry->fn.parent_ip = parent_ip; - spin_unlock(&data->lock); + spin_unlock_irqrestore(&data->lock, irq_flags); } notrace void @@ -706,15 +707,16 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data, unsigned long arg1, unsigned long arg2, unsigned long arg3) { struct trace_entry *entry; + unsigned long irq_flags; - spin_lock(&data->lock); + spin_lock_irqsave(&data->lock, irq_flags); entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, 0); entry->type = TRACE_SPECIAL; entry->special.arg1 = arg1; entry->special.arg2 = arg2; entry->special.arg3 = arg3; - spin_unlock(&data->lock); + spin_unlock_irqrestore(&data->lock, irq_flags); } notrace void @@ -724,8 +726,9 @@ tracing_sched_switch_trace(struct trace_array *tr, unsigned long flags) { struct trace_entry *entry; + unsigned long irq_flags; - spin_lock(&data->lock); + spin_lock_irqsave(&data->lock, irq_flags); entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); entry->type = TRACE_CTX; @@ -734,7 +737,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.prev_state = prev->state; entry->ctx.next_pid = next->pid; entry->ctx.next_prio = next->prio; - spin_unlock(&data->lock); + spin_unlock_irqrestore(&data->lock, irq_flags); } enum trace_file_type { -- cgit v1.2.3 From 088b1e427dbba2af93cb6a7d39258c10ff58dd27 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:48 +0200 Subject: ftrace: pipe fixes Some fixes for better output with the trace pipe. 
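The main change below checkpoints the sequence length before formatting a line and restores it if the line did not fit, so pipe readers never see a truncated line. As a sketch of that pattern (the helper itself is not part of the patch, only the field and function names are taken from the surrounding code):

/* sketch: emit a line into the seq buffer only if it fits completely */
static int print_whole_line(struct trace_iterator *iter)
{
        int len = iter->seq.len;        /* checkpoint the current length */

        if (!print_trace_line(iter)) {
                iter->seq.len = len;    /* drop the partial line */
                return 0;
        }
        return 1;
}
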
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 70f94fa92c1..c56fc5e6013 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -770,11 +770,6 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, array = page_address(page); - /* Still possible to catch up to the tail */ - if (iter->next_idx[cpu] && array == data->trace_tail && - iter->next_page_idx[cpu] == data->trace_tail_idx) - return NULL; - WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE); return &array[iter->next_page_idx[cpu]]; } @@ -1921,7 +1916,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, struct trace_iterator *iter = filp->private_data; struct trace_array_cpu *data; static cpumask_t mask; - struct trace_entry *entry; static int start; unsigned long flags; int read = 0; @@ -2013,10 +2007,15 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, cpu_set(cpu, mask); } - while ((entry = find_next_entry_inc(iter)) != NULL) { + while (find_next_entry_inc(iter) != NULL) { + int len = iter->seq.len; + ret = print_trace_line(iter); - if (!ret) + if (!ret) { + /* don't print partial lines */ + iter->seq.len = len; break; + } trace_consume(iter); -- cgit v1.2.3 From 37ad508419f0fdfda7b378756eb1f35cfd26d96d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:48 +0200 Subject: ftrace - fix dynamic ftrace memory leak The ftrace dynamic function update allocates a record to store the instruction pointers that are being modified. If the modified instruction pointer fails to update, then the record is marked as failed and nothing more is done. Worse, if the modification fails, but the record ip function is still called, it will allocate a new record and try again. In just a matter of time, will this cause a serious memory leak and crash the system. This patch plugs this memory leak. When a record fails, it is included back into the pool of records to be used. Now a record may fail over and over again, but the number of allocated records will not increase. 
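The reuse scheme is a plain intrusive free list: when a record fails, its ip field is overwritten with the previous head of the list and the record is marked free; the allocator then hands such records out again before taking anything new from the page pool. A stand-alone userspace-style sketch of the same idea (an illustration of the technique, not the kernel code itself):

#include <stdlib.h>
#include <string.h>

#define FL_FREE 0x1UL

struct rec {
        unsigned long ip;       /* reused as the "next free record" link */
        unsigned long flags;
};

static struct rec *free_records;        /* head of the free list */

static void rec_free(struct rec *r)
{
        r->ip = (unsigned long)free_records;
        r->flags |= FL_FREE;
        free_records = r;
}

static struct rec *rec_alloc(void)
{
        struct rec *r = free_records;

        if (r) {                                /* recycle a failed record */
                free_records = (struct rec *)r->ip;
                memset(r, 0, sizeof(*r));
                return r;
        }
        return calloc(1, sizeof(*r));           /* otherwise take a fresh one */
}
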
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 45 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a15e068535f..8e02aa690b2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -188,6 +188,8 @@ static int ftraced_suspend; static int ftrace_record_suspend; +static struct dyn_ftrace *ftrace_free_records; + static inline int notrace ftrace_ip_in_hash(unsigned long ip, unsigned long key) { @@ -211,8 +213,35 @@ ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) hlist_add_head(&node->node, &ftrace_hash[key]); } +static notrace void ftrace_free_rec(struct dyn_ftrace *rec) +{ + /* no locking, only called from kstop_machine */ + + rec->ip = (unsigned long)ftrace_free_records; + ftrace_free_records = rec; + rec->flags |= FTRACE_FL_FREE; +} + static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { + struct dyn_ftrace *rec; + + /* First check for freed records */ + if (ftrace_free_records) { + rec = ftrace_free_records; + + /* todo, disable tracing altogether on this warning */ + if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { + WARN_ON_ONCE(1); + ftrace_free_records = NULL; + return NULL; + } + + ftrace_free_records = (void *)rec->ip; + memset(rec, 0, sizeof(*rec)); + return rec; + } + if (ftrace_pages->index == ENTRIES_PER_PAGE) { if (!ftrace_pages->next) return NULL; @@ -356,8 +385,16 @@ __ftrace_replace_code(struct dyn_ftrace *rec, } failed = ftrace_modify_code(ip, old, new); - if (failed) - rec->flags |= FTRACE_FL_FAILED; + if (failed) { + unsigned long key; + /* It is possible that the function hasn't been converted yet */ + key = hash_long(ip, FTRACE_HASHBITS); + if (!ftrace_ip_in_hash(ip, key)) { + rec->flags |= FTRACE_FL_FAILED; + ftrace_free_rec(rec); + } + + } } static void notrace ftrace_replace_code(int enable) @@ -407,8 +444,10 @@ ftrace_code_disable(struct dyn_ftrace *rec) call = ftrace_call_replace(ip, MCOUNT_ADDR); failed = ftrace_modify_code(ip, call, nop); - if (failed) + if (failed) { rec->flags |= FTRACE_FL_FAILED; + ftrace_free_rec(rec); + } } static int notrace __ftrace_modify_code(void *data) -- cgit v1.2.3 From 4eebcc81a33fbc45e28542b50197ed7b3c486d90 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:48 +0200 Subject: ftrace: disable tracing on failure Since ftrace touches practically every function. If we detect any anomaly, we want to fully disable ftrace. This patch adds code to try shutdown ftrace as much as possible without doing any more harm is something is detected not quite correct. This only kills ftrace, this patch does have checks for other parts of the tracer (irqsoff, wakeup, etc.). Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 112 ++++++++++++++++++++++++++++++++++++++---- kernel/trace/trace_selftest.c | 4 ++ 2 files changed, 107 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8e02aa690b2..ff42345dd78 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -29,9 +29,16 @@ #include "trace.h" -int ftrace_enabled; +/* ftrace_enabled is a method to turn ftrace on or off */ +int ftrace_enabled __read_mostly; static int last_ftrace_enabled; +/* + * ftrace_disabled is set when an anomaly is discovered. + * ftrace_disabled is much stronger than ftrace_enabled. 
+ */ +static int ftrace_disabled __read_mostly; + static DEFINE_SPINLOCK(ftrace_lock); static DEFINE_MUTEX(ftrace_sysctl_lock); @@ -230,10 +237,11 @@ static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) if (ftrace_free_records) { rec = ftrace_free_records; - /* todo, disable tracing altogether on this warning */ if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { WARN_ON_ONCE(1); ftrace_free_records = NULL; + ftrace_disabled = 1; + ftrace_enabled = 0; return NULL; } @@ -260,7 +268,7 @@ ftrace_record_ip(unsigned long ip) int resched; int atomic; - if (!ftrace_enabled) + if (!ftrace_enabled || ftrace_disabled) return; resched = need_resched(); @@ -485,6 +493,9 @@ static void notrace ftrace_startup(void) { int command = 0; + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftraced_lock); ftraced_suspend++; if (ftraced_suspend == 1) @@ -507,6 +518,9 @@ static void notrace ftrace_shutdown(void) { int command = 0; + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftraced_lock); ftraced_suspend--; if (!ftraced_suspend) @@ -529,6 +543,9 @@ static void notrace ftrace_startup_sysctl(void) { int command = FTRACE_ENABLE_MCOUNT; + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftraced_lock); /* Force update next time */ saved_ftrace_func = NULL; @@ -544,6 +561,9 @@ static void notrace ftrace_shutdown_sysctl(void) { int command = FTRACE_DISABLE_MCOUNT; + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftraced_lock); /* ftraced_suspend is true if ftrace is running */ if (ftraced_suspend) @@ -600,6 +620,9 @@ static int notrace __ftrace_update_code(void *ignore) static void notrace ftrace_update_code(void) { + if (unlikely(ftrace_disabled)) + return; + stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); } @@ -614,6 +637,9 @@ static int notrace ftraced(void *ignore) /* check once a second */ schedule_timeout(HZ); + if (unlikely(ftrace_disabled)) + continue; + mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftraced_lock); if (ftrace_enabled && ftraced_trigger && !ftraced_suspend) { @@ -628,6 +654,7 @@ static int notrace ftraced(void *ignore) ftrace_update_cnt != 1 ? "s" : "", ftrace_update_tot_cnt, usecs, usecs != 1 ? 
"s" : ""); + ftrace_disabled = 1; WARN_ON_ONCE(1); } ftraced_trigger = 0; @@ -785,6 +812,9 @@ ftrace_avail_open(struct inode *inode, struct file *file) struct ftrace_iterator *iter; int ret; + if (unlikely(ftrace_disabled)) + return -ENODEV; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return -ENOMEM; @@ -843,6 +873,9 @@ ftrace_filter_open(struct inode *inode, struct file *file) struct ftrace_iterator *iter; int ret = 0; + if (unlikely(ftrace_disabled)) + return -ENODEV; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return -ENOMEM; @@ -1063,6 +1096,9 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, */ notrace void ftrace_set_filter(unsigned char *buf, int len, int reset) { + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftrace_filter_lock); if (reset) ftrace_filter_reset(); @@ -1133,7 +1169,7 @@ int ftrace_force_update(void) DECLARE_WAITQUEUE(wait, current); int ret = 0; - if (!ftraced_task) + if (unlikely(ftrace_disabled)) return -ENODEV; mutex_lock(&ftraced_lock); @@ -1142,6 +1178,11 @@ int ftrace_force_update(void) set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&ftraced_waiters, &wait); + if (unlikely(!ftraced_task)) { + ret = -ENODEV; + goto out; + } + do { mutex_unlock(&ftraced_lock); wake_up_process(ftraced_task); @@ -1154,6 +1195,7 @@ int ftrace_force_update(void) set_current_state(TASK_INTERRUPTIBLE); } while (last_counter == ftraced_iteration_counter); + out: mutex_unlock(&ftraced_lock); remove_wait_queue(&ftraced_waiters, &wait); set_current_state(TASK_RUNNING); @@ -1161,6 +1203,22 @@ int ftrace_force_update(void) return ret; } +static void ftrace_force_shutdown(void) +{ + struct task_struct *task; + int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC; + + mutex_lock(&ftraced_lock); + task = ftraced_task; + ftraced_task = NULL; + ftraced_suspend = -1; + ftrace_run_update_code(command); + mutex_unlock(&ftraced_lock); + + if (task) + kthread_stop(task); +} + static __init int ftrace_init_debugfs(void) { struct dentry *d_tracer; @@ -1194,21 +1252,29 @@ static int __init notrace ftrace_dynamic_init(void) stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); /* ftrace_dyn_arch_init places the return code in addr */ - if (addr) - return addr; + if (addr) { + ret = (int)addr; + goto failed; + } ret = ftrace_dyn_table_alloc(); if (ret) - return ret; + goto failed; p = kthread_run(ftraced, NULL, "ftraced"); - if (IS_ERR(p)) - return -1; + if (IS_ERR(p)) { + ret = -1; + goto failed; + } last_ftrace_enabled = ftrace_enabled = 1; ftraced_task = p; return 0; + + failed: + ftrace_disabled = 1; + return ret; } core_initcall(ftrace_dynamic_init); @@ -1217,8 +1283,30 @@ core_initcall(ftrace_dynamic_init); # define ftrace_shutdown() do { } while (0) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) +# define ftrace_force_shutdown() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ +/** + * ftrace_kill - totally shutdown ftrace + * + * This is a safety measure. If something was detected that seems + * wrong, calling this function will keep ftrace from doing + * any more modifications, and updates. + * used when something went wrong. 
+ */ +void ftrace_kill(void) +{ + mutex_lock(&ftrace_sysctl_lock); + ftrace_disabled = 1; + ftrace_enabled = 0; + + clear_ftrace_function(); + mutex_unlock(&ftrace_sysctl_lock); + + /* Try to totally disable ftrace */ + ftrace_force_shutdown(); +} + /** * register_ftrace_function - register a function for profiling * @ops - ops structure that holds the function for profiling. @@ -1234,6 +1322,9 @@ int register_ftrace_function(struct ftrace_ops *ops) { int ret; + if (unlikely(ftrace_disabled)) + return -1; + mutex_lock(&ftrace_sysctl_lock); ret = __register_ftrace_function(ops); ftrace_startup(); @@ -1267,6 +1358,9 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, { int ret; + if (unlikely(ftrace_disabled)) + return -ENODEV; + mutex_lock(&ftrace_sysctl_lock); ret = proc_dointvec(table, write, file, buffer, lenp, ppos); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a6f1ed75f83..85715b86a34 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -248,6 +248,10 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ftrace_enabled = save_ftrace_enabled; tracer_enabled = save_tracer_enabled; + /* kill ftrace totally if we failed */ + if (ret) + ftrace_kill(); + return ret; } #endif /* CONFIG_FTRACE */ -- cgit v1.2.3 From 26994ead1fc8cced63f17e9848edc1771036664e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:48 +0200 Subject: ftrace: enabled tracing by default This patch is the correct way to have tracing enabled by default. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c56fc5e6013..3dc6eac85b1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -107,7 +107,7 @@ static struct trace_array max_tr; static DEFINE_PER_CPU(struct trace_array_cpu, max_data); -static int tracer_enabled; +static int tracer_enabled = 1; static unsigned long trace_nr_entries = 16384UL; static struct tracer *trace_types __read_mostly; @@ -2268,6 +2268,8 @@ __init static int tracer_alloc_buffers(void) int ret = -ENOMEM; int i; + global_trace.ctrl = tracer_enabled; + /* Allocate the first page for all buffers */ for_each_possible_cpu(i) { data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); -- cgit v1.2.3 From 0fd9e0dac9026df09986a4b201518ae015814aef Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:48 +0200 Subject: ftrace: use cpu clock again Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 54 +--------------------------------------------------- 1 file changed, 1 insertion(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3dc6eac85b1..d74c039305a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -42,61 +42,9 @@ ns2usecs(cycle_t nsec) return nsec; } -static const int time_sync_freq_max = 128; -static const cycle_t time_sync_thresh = 100000; - -static DEFINE_PER_CPU(cycle_t, time_offset); -static DEFINE_PER_CPU(cycle_t, prev_cpu_time); -static DEFINE_PER_CPU(int, time_sync_count); -static DEFINE_PER_CPU(int, time_sync_freq); - -/* - * Global lock which we take every now and then to synchronize - * the CPUs time. 
This method is not warp-safe, but it's good - * enough to synchronize slowly diverging time sources and thus - * it's good enough for tracing: - */ -static DEFINE_SPINLOCK(time_sync_lock); -static cycle_t prev_global_time; - -static notrace cycle_t __ftrace_now_sync(cycles_t time, int cpu) -{ - unsigned long flags; - - spin_lock_irqsave(&time_sync_lock, flags); - - /* - * Update the synchronization frequency: - */ - if (per_cpu(time_sync_freq, cpu) < time_sync_freq_max) - per_cpu(time_sync_freq, cpu) *= 2; - per_cpu(time_sync_count, cpu) = per_cpu(time_sync_freq, cpu); - - if (time < prev_global_time) { - per_cpu(time_offset, cpu) += prev_global_time - time; - time = prev_global_time; - } else { - prev_global_time = time; - } - - spin_unlock_irqrestore(&time_sync_lock, flags); - - return time; -} - notrace cycle_t ftrace_now(int cpu) { - cycle_t prev_cpu_time, time, delta_time; - - prev_cpu_time = per_cpu(prev_cpu_time, cpu); - time = sched_clock() + per_cpu(time_offset, cpu); - delta_time = time-prev_cpu_time; - - if (unlikely(delta_time > time_sync_thresh || - --per_cpu(time_sync_count, cpu) <= 0)) - time = __ftrace_now_sync(time, cpu); - - return time; + return cpu_clock(cpu); } static struct trace_array global_trace; -- cgit v1.2.3 From 2e0f57618529a2739a5e1570e6c445c9c966b595 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:49 +0200 Subject: ftrace: build fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 111 ++++++++++++++++++++++++----------------- kernel/trace/trace_functions.c | 2 +- 2 files changed, 67 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d74c039305a..71b25b79b3d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -432,47 +432,6 @@ notrace void tracing_reset(struct trace_array_cpu *data) data->trace_tail_idx = 0; } -#ifdef CONFIG_FTRACE -static notrace void -function_trace_call(unsigned long ip, unsigned long parent_ip) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - - if (unlikely(!tracer_enabled)) - return; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) - ftrace(tr, data, ip, parent_ip, flags); - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = function_trace_call, -}; -#endif - -notrace void tracing_start_function_trace(void) -{ - register_ftrace_function(&trace_ops); -} - -notrace void tracing_stop_function_trace(void) -{ - unregister_ftrace_function(&trace_ops); -} - #define SAVED_CMDLINES 128 static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; @@ -635,8 +594,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) } notrace void -ftrace(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, unsigned long flags) +__ftrace(struct trace_array *tr, struct trace_array_cpu *data, + unsigned long ip, unsigned long parent_ip, unsigned long flags) { struct trace_entry *entry; unsigned long irq_flags; @@ -650,6 +609,14 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, spin_unlock_irqrestore(&data->lock, irq_flags); } +notrace void +ftrace(struct trace_array *tr, struct trace_array_cpu *data, + unsigned long ip, unsigned long 
parent_ip, unsigned long flags) +{ + if (likely(!atomic_read(&data->disabled))) + __ftrace(tr, data, ip, parent_ip, flags); +} + notrace void trace_special(struct trace_array *tr, struct trace_array_cpu *data, unsigned long arg1, unsigned long arg2, unsigned long arg3) @@ -688,6 +655,47 @@ tracing_sched_switch_trace(struct trace_array *tr, spin_unlock_irqrestore(&data->lock, irq_flags); } +#ifdef CONFIG_FTRACE +static notrace void +function_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (unlikely(!tracer_enabled)) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + __ftrace(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = function_trace_call, +}; + +notrace void tracing_start_function_trace(void) +{ + register_ftrace_function(&trace_ops); +} + +notrace void tracing_stop_function_trace(void) +{ + unregister_ftrace_function(&trace_ops); +} +#endif + enum trace_file_type { TRACE_FILE_LAT_FMT = 1, }; @@ -722,7 +730,7 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, return &array[iter->next_page_idx[cpu]]; } -static struct notrace trace_entry * +static struct trace_entry * notrace find_next_entry(struct trace_iterator *iter, int *ent_cpu) { struct trace_array *tr = iter->tr; @@ -1866,6 +1874,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, static cpumask_t mask; static int start; unsigned long flags; + int ftrace_save; int read = 0; int cpu; int len; @@ -1944,6 +1953,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, cpus_clear(mask); local_irq_save(flags); + ftrace_save = ftrace_enabled; + ftrace_enabled = 0; + smp_wmb(); for_each_possible_cpu(cpu) { data = iter->tr->data[cpu]; @@ -1951,10 +1963,14 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, continue; atomic_inc(&data->disabled); - spin_lock(&data->lock); cpu_set(cpu, mask); } + for_each_cpu_mask(cpu, mask) { + data = iter->tr->data[cpu]; + spin_lock(&data->lock); + } + while (find_next_entry_inc(iter) != NULL) { int len = iter->seq.len; @@ -1974,8 +1990,13 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, for_each_cpu_mask(cpu, mask) { data = iter->tr->data[cpu]; spin_unlock(&data->lock); + } + + for_each_cpu_mask(cpu, mask) { + data = iter->tr->data[cpu]; atomic_dec(&data->disabled); } + ftrace_enabled = ftrace_save; local_irq_restore(flags); /* Now copy what we have to the user */ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index e5d34b78fc9..69a0eb00a0a 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -9,10 +9,10 @@ * Copyright (C) 2004-2006 Ingo Molnar * Copyright (C) 2004 William Lee Irwin III */ -#include #include #include #include +#include #include "trace.h" -- cgit v1.2.3 From 5e3ca0ec76fce92daa4eed0d02de9c79b1fe3920 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:49 +0200 Subject: ftrace: introduce the "hex" output method Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 
71b25b79b3d..6974b212e93 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -103,7 +103,8 @@ enum trace_iterator_flags { TRACE_ITER_SYM_ADDR = 0x04, TRACE_ITER_VERBOSE = 0x08, TRACE_ITER_RAW = 0x10, - TRACE_ITER_BIN = 0x20, + TRACE_ITER_HEX = 0x20, + TRACE_ITER_BIN = 0x40, }; #define TRACE_ITER_SYM_MASK \ @@ -116,6 +117,7 @@ static const char *trace_options[] = { "sym-addr", "verbose", "raw", + "hex", "bin", NULL }; @@ -238,6 +240,47 @@ trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) return len; } +#define HEX_CHARS 17 + +static notrace int +trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) +{ + unsigned char hex[HEX_CHARS]; + unsigned char *data; + unsigned char byte; + int i, j; + + BUG_ON(len >= HEX_CHARS); + + data = mem; + +#ifdef __BIG_ENDIAN + for (i = 0, j = 0; i < len; i++) { +#else + for (i = len-1, j = 0; i >= 0; i--) { +#endif + byte = data[i]; + + hex[j] = byte & 0x0f; + if (hex[j] >= 10) + hex[j] += 'a' - 10; + else + hex[j] += '0'; + j++; + + hex[j] = byte >> 4; + if (hex[j] >= 10) + hex[j] += 'a' - 10; + else + hex[j] += '0'; + j++; + } + hex[j] = ' '; + j++; + + return trace_seq_putmem(s, hex, j); +} + static notrace void trace_seq_reset(struct trace_seq *s) { @@ -1274,6 +1317,51 @@ do { \ return 0; \ } while (0) +#define SEQ_PUT_HEX_FIELD_RET(s, x) \ +do { \ + if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ + return 0; \ +} while (0) + +static notrace int print_hex_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + unsigned char newline = '\n'; + struct trace_entry *entry; + int S; + + entry = iter->ent; + + SEQ_PUT_HEX_FIELD_RET(s, entry->pid); + SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); + SEQ_PUT_HEX_FIELD_RET(s, entry->t); + + switch (entry->type) { + case TRACE_FN: + SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip); + SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); + break; + case TRACE_CTX: + S = entry->ctx.prev_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.prev_state] : 'X'; + SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid); + SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio); + SEQ_PUT_HEX_FIELD_RET(s, S); + SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid); + SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio); + SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); + break; + case TRACE_SPECIAL: + SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1); + SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2); + SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3); + break; + } + SEQ_PUT_FIELD_RET(s, newline); + + return 1; +} + static notrace int print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1327,6 +1415,9 @@ static int print_trace_line(struct trace_iterator *iter) if (trace_flags & TRACE_ITER_BIN) return print_bin_fmt(iter); + if (trace_flags & TRACE_ITER_HEX) + return print_hex_fmt(iter); + if (trace_flags & TRACE_ITER_RAW) return print_raw_fmt(iter); -- cgit v1.2.3 From 2577046740fe6d77864128c6187c11125c2449ea Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:49 +0200 Subject: ftrace: build fix no need to backmerge, only affects ftrace-enabled kernels. 
(which is not the default) Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6974b212e93..958c4d77a67 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1965,7 +1965,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, static cpumask_t mask; static int start; unsigned long flags; +#ifdef CONFIG_FTRACE int ftrace_save; +#endif int read = 0; int cpu; int len; @@ -2044,8 +2046,10 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, cpus_clear(mask); local_irq_save(flags); +#ifdef CONFIG_FTRACE ftrace_save = ftrace_enabled; ftrace_enabled = 0; +#endif smp_wmb(); for_each_possible_cpu(cpu) { data = iter->tr->data[cpu]; @@ -2087,7 +2091,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, data = iter->tr->data[cpu]; atomic_dec(&data->disabled); } +#ifdef CONFIG_FTRACE ftrace_enabled = ftrace_save; +#endif local_irq_restore(flags); /* Now copy what we have to the user */ -- cgit v1.2.3 From 2a2cc8f7c4d0dfd75720867f7dc58d24f075edfc Mon Sep 17 00:00:00 2001 From: Soeren Sandmann Pedersen Date: Mon, 12 May 2008 21:20:49 +0200 Subject: ftrace: allow the event pipe to be polled Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 958c4d77a67..d041578affd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -63,6 +64,7 @@ static struct tracer *current_trace __read_mostly; static int max_tracer_type_len; static DEFINE_MUTEX(trace_types_lock); +static DECLARE_WAIT_QUEUE_HEAD (trace_wait); #define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry)) @@ -105,6 +107,7 @@ enum trace_iterator_flags { TRACE_ITER_RAW = 0x10, TRACE_ITER_HEX = 0x20, TRACE_ITER_BIN = 0x40, + TRACE_ITER_BLOCK = 0x80, }; #define TRACE_ITER_SYM_MASK \ @@ -119,6 +122,7 @@ static const char *trace_options[] = { "raw", "hex", "bin", + "block", NULL }; @@ -650,6 +654,9 @@ __ftrace(struct trace_array *tr, struct trace_array_cpu *data, entry->fn.ip = ip; entry->fn.parent_ip = parent_ip; spin_unlock_irqrestore(&data->lock, irq_flags); + + if (!(trace_flags & TRACE_ITER_BLOCK)) + wake_up (&trace_wait); } notrace void @@ -675,6 +682,9 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data, entry->special.arg2 = arg2; entry->special.arg3 = arg3; spin_unlock_irqrestore(&data->lock, irq_flags); + + if (!(trace_flags & TRACE_ITER_BLOCK)) + wake_up (&trace_wait); } notrace void @@ -696,6 +706,9 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.next_pid = next->pid; entry->ctx.next_prio = next->prio; spin_unlock_irqrestore(&data->lock, irq_flags); + + if (!(trace_flags & TRACE_ITER_BLOCK)) + wake_up (&trace_wait); } #ifdef CONFIG_FTRACE @@ -1765,7 +1778,6 @@ static struct file_operations tracing_readme_fops = { .read = tracing_readme_read, }; - static ssize_t tracing_ctrl_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -1953,6 +1965,28 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) return 0; } +static unsigned int +tracing_poll_pipe(struct file *filp, poll_table *poll_table) +{ + struct trace_iterator *iter = filp->private_data; + + if (trace_flags & TRACE_ITER_BLOCK) { + /* + * 
Always select as readable when in blocking mode + */ + return POLLIN | POLLRDNORM; + } + else { + if (!trace_empty(iter)) + return POLLIN | POLLRDNORM; + poll_wait(filp, &trace_wait, poll_table); + if (!trace_empty(iter)) + return POLLIN | POLLRDNORM; + + return 0; + } +} + /* * Consumer reader. */ @@ -1991,6 +2025,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, start = 0; while (trace_empty(iter)) { + if (!(trace_flags & TRACE_ITER_BLOCK)) + return -EWOULDBLOCK; /* * This is a make-shift waitqueue. The reason we don't use * an actual wait queue is because: @@ -2134,6 +2170,7 @@ static struct file_operations set_tracer_fops = { static struct file_operations tracing_pipe_fops = { .open = tracing_open_pipe, + .poll = tracing_poll_pipe, .read = tracing_read_pipe, .release = tracing_release_pipe, }; -- cgit v1.2.3 From 6fb44b717c10ecf37beaaebd312f3afa93fed714 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:49 +0200 Subject: ftrace: add trace_function api for other tracers to use A new check was added in the ftrace function that wont trace if the CPU trace buffer is disabled. Unfortunately, other tracers used ftrace() to write to the buffer after they disabled it. The new disable check makes these calls into a nop. This patch changes the __ftrace that is called without the check into a new api for the other tracers to use, called "trace_function". The other tracers use this interface instead when the trace CPU buffer is already disabled. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 8 ++++---- kernel/trace/trace.h | 5 +++++ kernel/trace/trace_irqsoff.c | 10 +++++----- kernel/trace/trace_sched_wakeup.c | 5 +++-- 4 files changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d041578affd..9022c357032 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -641,8 +641,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) } notrace void -__ftrace(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, unsigned long flags) +trace_function(struct trace_array *tr, struct trace_array_cpu *data, + unsigned long ip, unsigned long parent_ip, unsigned long flags) { struct trace_entry *entry; unsigned long irq_flags; @@ -664,7 +664,7 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, unsigned long flags) { if (likely(!atomic_read(&data->disabled))) - __ftrace(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags); } notrace void @@ -730,7 +730,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) - __ftrace(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags); atomic_dec(&data->disabled); local_irq_restore(flags); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7bdfef35c05..faf9f67246a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -169,6 +169,11 @@ void trace_special(struct trace_array *tr, unsigned long arg1, unsigned long arg2, unsigned long arg3); +void trace_function(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags); void tracing_start_function_trace(void); void tracing_stop_function_trace(void); diff --git a/kernel/trace/trace_irqsoff.c 
b/kernel/trace/trace_irqsoff.c index d2a6e6f1ad2..3269f4ff517 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -95,7 +95,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) - ftrace(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags); atomic_dec(&data->disabled); } @@ -150,7 +150,7 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out_unlock; - ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); + trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); latency = nsecs_to_usecs(delta); @@ -188,7 +188,7 @@ out: data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); tracing_reset(data); - ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); + trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); } static inline void notrace @@ -221,7 +221,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) local_save_flags(flags); - ftrace(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags); __get_cpu_var(tracing_cpu) = 1; @@ -254,7 +254,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); local_save_flags(flags); - ftrace(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags); check_critical_timing(tr, data, parent_ip ? : ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index b7df825c3af..3549e4154f1 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -85,7 +85,7 @@ wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) if (unlikely(!tracer_enabled || next != wakeup_task)) goto out_unlock; - ftrace(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags); + trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags); /* * usecs conversion is slow so we try to delay the conversion @@ -192,7 +192,8 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p, local_save_flags(flags); tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); - ftrace(tr, tr->data[wakeup_cpu], CALLER_ADDR1, CALLER_ADDR2, flags); + trace_function(tr, tr->data[wakeup_cpu], + CALLER_ADDR1, CALLER_ADDR2, flags); out_locked: spin_unlock(&wakeup_lock); -- cgit v1.2.3 From 9ff9cdb2d3b0971f89e899b3420aadd91bddc215 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:50 +0200 Subject: ftrace: cleanups clean up recent code. 
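The trace_function() conversions in the irqsoff and wakeup tracers above all follow one contract: a tracer that has already bumped the per-CPU disabled counter must write through the unchecked trace_function() entry point, because the checked ftrace() wrapper added earlier in the series now treats a disabled buffer as "drop the record". A minimal user-space sketch of that split, with generic names rather than the kernel implementation:

#include <stdatomic.h>
#include <stdio.h>

/* stand-in for struct trace_array_cpu: only the disable counter matters here */
struct cpu_buffer {
	atomic_int disabled;	/* > 0 means: reject new entries */
	long last_ip;		/* pretend one-slot "ring buffer" */
};

/* raw writer -- the role trace_function() plays; the caller vouches for the buffer */
static void write_raw(struct cpu_buffer *buf, long ip)
{
	buf->last_ip = ip;
}

/* checked wrapper -- the role ftrace() plays once the disable check is in place */
static void write_checked(struct cpu_buffer *buf, long ip)
{
	if (atomic_load(&buf->disabled) == 0)
		write_raw(buf, ip);
}

/*
 * A latency tracer disables the buffer around its own write, so it must use
 * the raw writer; the checked wrapper would silently drop the entry.  The
 * "== 0" test mirrors the kernel's atomic_inc_return(&data->disabled) == 1.
 */
static void tracer_hook(struct cpu_buffer *buf, long ip)
{
	if (atomic_fetch_add(&buf->disabled, 1) == 0)
		write_raw(buf, ip);
	atomic_fetch_sub(&buf->disabled, 1);
}

int main(void)
{
	struct cpu_buffer buf = { 0 };

	write_checked(&buf, 0x1000);	/* recorded: buffer enabled */
	tracer_hook(&buf, 0x2000);	/* recorded via the raw writer */
	printf("last ip: %#lx\n", buf.last_ip);
	return 0;
}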
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 5 +++-- kernel/trace/trace_irqsoff.c | 2 +- kernel/trace/trace_selftest.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ff42345dd78..425b1fec3d8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -197,8 +197,8 @@ static int ftrace_record_suspend; static struct dyn_ftrace *ftrace_free_records; -static inline int -notrace ftrace_ip_in_hash(unsigned long ip, unsigned long key) +static inline int notrace +ftrace_ip_in_hash(unsigned long ip, unsigned long key) { struct dyn_ftrace *p; struct hlist_node *t; @@ -1249,6 +1249,7 @@ static int __init notrace ftrace_dynamic_init(void) int ret; addr = (unsigned long)ftrace_record_ip; + stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); /* ftrace_dyn_arch_init places the return code in addr */ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 3269f4ff517..2ac0d09db6f 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -309,7 +309,7 @@ void trace_softirqs_off(unsigned long ip) { } -inline void print_irqtrace_events(struct task_struct *curr) +inline notrace void print_irqtrace_events(struct task_struct *curr) { } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 85715b86a34..546307de6e3 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -3,7 +3,7 @@ #include #include -static inline int trace_valid_entry(struct trace_entry *entry) +static notrace inline int trace_valid_entry(struct trace_entry *entry) { switch (entry->type) { case TRACE_FN: -- cgit v1.2.3 From caf8cdebfb6c1cff50ea8077f1a07c2333d6d1fd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:50 +0200 Subject: ftrace: remove address of function names PowerPC is very fragile when it comes to the use of function names and function addresses. ftrace needs to either use all function addresses or function names (i.e. my_func as opposed to &my_func). This patch chooses to use the names and not the addresses, and makes ftrace consistent.
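To make the my_func versus &my_func remark concrete: in plain C the two spellings evaluate to the same pointer, so the value of the patch is consistency, letting comparisons built in C agree with addresses produced by arch code. A small user-space sketch follows; the powerpc64 function-descriptor note in the comment is background on why such mismatches are plausible, not something the patch itself states. The kernel macros actually touched are FTRACE_ADDR and MCOUNT_ADDR, in the diff just below.

#include <stdio.h>

/* stand-in for the commit message's example function */
void my_func(void)
{
}

int main(void)
{
	/*
	 * Both spellings print the same value here; the patch simply picks
	 * the bare-name spelling and uses it everywhere.  One classic source
	 * of mismatch on powerpc64 is the function-descriptor ABI, where a
	 * function symbol seen from different contexts need not point at the
	 * first text instruction.
	 */
	printf("my_func  = %#lx\n", (long)my_func);
	printf("&my_func = %#lx\n", (long)&my_func);
	return 0;
}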
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 425b1fec3d8..57350cbd1f6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -326,8 +326,8 @@ ftrace_record_ip(unsigned long ip) preempt_enable_notrace(); } -#define FTRACE_ADDR ((long)(&ftrace_caller)) -#define MCOUNT_ADDR ((long)(&mcount)) +#define FTRACE_ADDR ((long)(ftrace_caller)) +#define MCOUNT_ADDR ((long)(mcount)) static void notrace __ftrace_replace_code(struct dyn_ftrace *rec, -- cgit v1.2.3 From b53dde9d34f2df396540988ebc65c33400f57b04 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:51 +0200 Subject: ftrace: disable -pg for the tracer itself Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 3fec653d653..c25a6cd6a52 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,3 +1,11 @@ + +# Do not instrument the tracer itself: + +ifdef CONFIG_FTRACE +ORIG_CFLAGS := $(KBUILD_CFLAGS) +KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) +endif + obj-$(CONFIG_FTRACE) += libftrace.o obj-$(CONFIG_TRACING) += trace.o -- cgit v1.2.3 From e309b41dd65aa953f86765eeeecc941d8e1e8b8f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:51 +0200 Subject: ftrace: remove notrace now that we have a kbuild method for notrace, no need to pollute the C code with the annotations. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 66 +++++++++++++-------------- kernel/trace/trace.c | 94 +++++++++++++++++++-------------------- kernel/trace/trace.h | 6 +-- kernel/trace/trace_functions.c | 12 ++--- kernel/trace/trace_irqsoff.c | 40 ++++++++--------- kernel/trace/trace_sched_switch.c | 12 ++--- kernel/trace/trace_sched_wakeup.c | 28 ++++++------ kernel/trace/trace_selftest.c | 2 +- 8 files changed, 130 insertions(+), 130 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 57350cbd1f6..281d97a3208 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -53,7 +53,7 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; /* mcount is defined per arch in assembly */ EXPORT_SYMBOL(mcount); -notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip) +void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op = ftrace_list; @@ -79,7 +79,7 @@ void clear_ftrace_function(void) ftrace_trace_function = ftrace_stub; } -static int notrace __register_ftrace_function(struct ftrace_ops *ops) +static int __register_ftrace_function(struct ftrace_ops *ops) { /* Should never be called by interrupts */ spin_lock(&ftrace_lock); @@ -110,7 +110,7 @@ static int notrace __register_ftrace_function(struct ftrace_ops *ops) return 0; } -static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) +static int __unregister_ftrace_function(struct ftrace_ops *ops) { struct ftrace_ops **p; int ret = 0; @@ -197,7 +197,7 @@ static int ftrace_record_suspend; static struct dyn_ftrace *ftrace_free_records; -static inline int notrace +static inline int ftrace_ip_in_hash(unsigned long ip, unsigned long key) { struct dyn_ftrace *p; @@ -214,13 +214,13 @@ ftrace_ip_in_hash(unsigned long ip, unsigned long key) return 
found; } -static inline void notrace +static inline void ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) { hlist_add_head(&node->node, &ftrace_hash[key]); } -static notrace void ftrace_free_rec(struct dyn_ftrace *rec) +static void ftrace_free_rec(struct dyn_ftrace *rec) { /* no locking, only called from kstop_machine */ @@ -229,7 +229,7 @@ static notrace void ftrace_free_rec(struct dyn_ftrace *rec) rec->flags |= FTRACE_FL_FREE; } -static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) +static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { struct dyn_ftrace *rec; @@ -259,7 +259,7 @@ static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) return &ftrace_pages->records[ftrace_pages->index++]; } -static void notrace +static void ftrace_record_ip(unsigned long ip) { struct dyn_ftrace *node; @@ -329,7 +329,7 @@ ftrace_record_ip(unsigned long ip) #define FTRACE_ADDR ((long)(ftrace_caller)) #define MCOUNT_ADDR ((long)(mcount)) -static void notrace +static void __ftrace_replace_code(struct dyn_ftrace *rec, unsigned char *old, unsigned char *new, int enable) { @@ -405,7 +405,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, } } -static void notrace ftrace_replace_code(int enable) +static void ftrace_replace_code(int enable) { unsigned char *new = NULL, *old = NULL; struct dyn_ftrace *rec; @@ -430,7 +430,7 @@ static void notrace ftrace_replace_code(int enable) } } -static notrace void ftrace_shutdown_replenish(void) +static void ftrace_shutdown_replenish(void) { if (ftrace_pages->next) return; @@ -439,7 +439,7 @@ static notrace void ftrace_shutdown_replenish(void) ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); } -static notrace void +static void ftrace_code_disable(struct dyn_ftrace *rec) { unsigned long ip; @@ -458,7 +458,7 @@ ftrace_code_disable(struct dyn_ftrace *rec) } } -static int notrace __ftrace_modify_code(void *data) +static int __ftrace_modify_code(void *data) { unsigned long addr; int *command = data; @@ -482,14 +482,14 @@ static int notrace __ftrace_modify_code(void *data) return 0; } -static void notrace ftrace_run_update_code(int command) +static void ftrace_run_update_code(int command) { stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); } static ftrace_func_t saved_ftrace_func; -static void notrace ftrace_startup(void) +static void ftrace_startup(void) { int command = 0; @@ -514,7 +514,7 @@ static void notrace ftrace_startup(void) mutex_unlock(&ftraced_lock); } -static void notrace ftrace_shutdown(void) +static void ftrace_shutdown(void) { int command = 0; @@ -539,7 +539,7 @@ static void notrace ftrace_shutdown(void) mutex_unlock(&ftraced_lock); } -static void notrace ftrace_startup_sysctl(void) +static void ftrace_startup_sysctl(void) { int command = FTRACE_ENABLE_MCOUNT; @@ -557,7 +557,7 @@ static void notrace ftrace_startup_sysctl(void) mutex_unlock(&ftraced_lock); } -static void notrace ftrace_shutdown_sysctl(void) +static void ftrace_shutdown_sysctl(void) { int command = FTRACE_DISABLE_MCOUNT; @@ -577,7 +577,7 @@ static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; -static int notrace __ftrace_update_code(void *ignore) +static int __ftrace_update_code(void *ignore) { struct dyn_ftrace *p; struct hlist_head head; @@ -618,7 +618,7 @@ static int notrace __ftrace_update_code(void *ignore) return 0; } -static void notrace ftrace_update_code(void) +static void ftrace_update_code(void) { if (unlikely(ftrace_disabled)) return; @@ -626,7 +626,7 @@ 
static void notrace ftrace_update_code(void) stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); } -static int notrace ftraced(void *ignore) +static int ftraced(void *ignore) { unsigned long usecs; @@ -733,7 +733,7 @@ struct ftrace_iterator { unsigned filtered; }; -static void notrace * +static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_iterator *iter = m->private; @@ -806,7 +806,7 @@ static struct seq_operations show_ftrace_seq_ops = { .show = t_show, }; -static int notrace +static int ftrace_avail_open(struct inode *inode, struct file *file) { struct ftrace_iterator *iter; @@ -845,7 +845,7 @@ int ftrace_avail_release(struct inode *inode, struct file *file) return 0; } -static void notrace ftrace_filter_reset(void) +static void ftrace_filter_reset(void) { struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -867,7 +867,7 @@ static void notrace ftrace_filter_reset(void) preempt_enable(); } -static int notrace +static int ftrace_filter_open(struct inode *inode, struct file *file) { struct ftrace_iterator *iter; @@ -903,7 +903,7 @@ ftrace_filter_open(struct inode *inode, struct file *file) return ret; } -static ssize_t notrace +static ssize_t ftrace_filter_read(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -913,7 +913,7 @@ ftrace_filter_read(struct file *file, char __user *ubuf, return -EPERM; } -static loff_t notrace +static loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int origin) { loff_t ret; @@ -933,7 +933,7 @@ enum { MATCH_END_ONLY, }; -static void notrace +static void ftrace_match(unsigned char *buff, int len) { char str[KSYM_SYMBOL_LEN]; @@ -1002,7 +1002,7 @@ ftrace_match(unsigned char *buff, int len) preempt_enable(); } -static ssize_t notrace +static ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -1094,7 +1094,7 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. 
*/ -notrace void ftrace_set_filter(unsigned char *buf, int len, int reset) +void ftrace_set_filter(unsigned char *buf, int len, int reset) { if (unlikely(ftrace_disabled)) return; @@ -1107,7 +1107,7 @@ notrace void ftrace_set_filter(unsigned char *buf, int len, int reset) mutex_unlock(&ftrace_filter_lock); } -static int notrace +static int ftrace_filter_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; @@ -1242,7 +1242,7 @@ static __init int ftrace_init_debugfs(void) fs_initcall(ftrace_init_debugfs); -static int __init notrace ftrace_dynamic_init(void) +static int __init ftrace_dynamic_init(void) { struct task_struct *p; unsigned long addr; @@ -1352,7 +1352,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) return ret; } -notrace int +int ftrace_enable_sysctl(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *lenp, loff_t *ppos) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9022c357032..f5898051fdd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -35,7 +35,7 @@ unsigned long __read_mostly tracing_thresh; static int tracing_disabled = 1; -static long notrace +static long ns2usecs(cycle_t nsec) { nsec += 500; @@ -43,7 +43,7 @@ ns2usecs(cycle_t nsec) return nsec; } -notrace cycle_t ftrace_now(int cpu) +cycle_t ftrace_now(int cpu) { return cpu_clock(cpu); } @@ -135,7 +135,7 @@ static DEFINE_SPINLOCK(ftrace_max_lock); * structure. (this way the maximum trace is permanently saved, * for later retrieval via /debugfs/tracing/latency_trace) */ -static notrace void +static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data = tr->data[cpu]; @@ -184,7 +184,7 @@ void *head_page(struct trace_array_cpu *data) return page_address(page); } -static notrace int +static int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) { int len = (PAGE_SIZE - 1) - s->len; @@ -207,7 +207,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) return len; } -static notrace int +static int trace_seq_puts(struct trace_seq *s, const char *str) { int len = strlen(str); @@ -221,7 +221,7 @@ trace_seq_puts(struct trace_seq *s, const char *str) return len; } -static notrace int +static int trace_seq_putc(struct trace_seq *s, unsigned char c) { if (s->len >= (PAGE_SIZE - 1)) @@ -232,7 +232,7 @@ trace_seq_putc(struct trace_seq *s, unsigned char c) return 1; } -static notrace int +static int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) { if (len > ((PAGE_SIZE - 1) - s->len)) @@ -246,7 +246,7 @@ trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) #define HEX_CHARS 17 -static notrace int +static int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) { unsigned char hex[HEX_CHARS]; @@ -285,13 +285,13 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) return trace_seq_putmem(s, hex, j); } -static notrace void +static void trace_seq_reset(struct trace_seq *s) { s->len = 0; } -static notrace void +static void trace_print_seq(struct seq_file *m, struct trace_seq *s) { int len = s->len >= PAGE_SIZE ? 
PAGE_SIZE - 1 : s->len; @@ -302,7 +302,7 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s) trace_seq_reset(s); } -notrace static void +static void flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) { struct list_head flip_pages; @@ -323,7 +323,7 @@ flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) check_pages(tr2); } -notrace void +void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data; @@ -348,7 +348,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) * @tsk - task with the latency * @cpu - the cpu of the buffer to copy. */ -notrace void +void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data = tr->data[cpu]; @@ -471,7 +471,7 @@ void unregister_tracer(struct tracer *type) mutex_unlock(&trace_types_lock); } -notrace void tracing_reset(struct trace_array_cpu *data) +void tracing_reset(struct trace_array_cpu *data) { data->trace_idx = 0; data->trace_head = data->trace_tail = head_page(data); @@ -494,9 +494,9 @@ static void trace_init_cmdlines(void) cmdline_idx = 0; } -notrace void trace_stop_cmdline_recording(void); +void trace_stop_cmdline_recording(void); -static notrace void trace_save_cmdline(struct task_struct *tsk) +static void trace_save_cmdline(struct task_struct *tsk) { unsigned map; unsigned idx; @@ -531,7 +531,7 @@ static notrace void trace_save_cmdline(struct task_struct *tsk) spin_unlock(&trace_cmdline_lock); } -static notrace char *trace_find_cmdline(int pid) +static char *trace_find_cmdline(int pid) { char *cmdline = "<...>"; unsigned map; @@ -552,7 +552,7 @@ static notrace char *trace_find_cmdline(int pid) return cmdline; } -notrace void tracing_record_cmdline(struct task_struct *tsk) +void tracing_record_cmdline(struct task_struct *tsk) { if (atomic_read(&trace_record_cmdline_disabled)) return; @@ -560,7 +560,7 @@ notrace void tracing_record_cmdline(struct task_struct *tsk) trace_save_cmdline(tsk); } -static inline notrace struct list_head * +static inline struct list_head * trace_next_list(struct trace_array_cpu *data, struct list_head *next) { /* @@ -574,7 +574,7 @@ trace_next_list(struct trace_array_cpu *data, struct list_head *next) return next; } -static inline notrace void * +static inline void * trace_next_page(struct trace_array_cpu *data, void *addr) { struct list_head *next; @@ -588,7 +588,7 @@ trace_next_page(struct trace_array_cpu *data, void *addr) return page_address(page); } -static inline notrace struct trace_entry * +static inline struct trace_entry * tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) { unsigned long idx, idx_next; @@ -623,7 +623,7 @@ tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) return entry; } -static inline notrace void +static inline void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) { struct task_struct *tsk = current; @@ -640,7 +640,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) (need_resched() ? 
TRACE_FLAG_NEED_RESCHED : 0); } -notrace void +void trace_function(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, unsigned long flags) { @@ -659,7 +659,7 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, wake_up (&trace_wait); } -notrace void +void ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, unsigned long flags) { @@ -667,7 +667,7 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags); } -notrace void +void trace_special(struct trace_array *tr, struct trace_array_cpu *data, unsigned long arg1, unsigned long arg2, unsigned long arg3) { @@ -687,7 +687,7 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data, wake_up (&trace_wait); } -notrace void +void tracing_sched_switch_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *prev, struct task_struct *next, @@ -712,7 +712,7 @@ tracing_sched_switch_trace(struct trace_array *tr, } #ifdef CONFIG_FTRACE -static notrace void +static void function_trace_call(unsigned long ip, unsigned long parent_ip) { struct trace_array *tr = &global_trace; @@ -741,12 +741,12 @@ static struct ftrace_ops trace_ops __read_mostly = .func = function_trace_call, }; -notrace void tracing_start_function_trace(void) +void tracing_start_function_trace(void) { register_ftrace_function(&trace_ops); } -notrace void tracing_stop_function_trace(void) +void tracing_stop_function_trace(void) { unregister_ftrace_function(&trace_ops); } @@ -786,7 +786,7 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, return &array[iter->next_page_idx[cpu]]; } -static struct trace_entry * notrace +static struct trace_entry * find_next_entry(struct trace_iterator *iter, int *ent_cpu) { struct trace_array *tr = iter->tr; @@ -813,7 +813,7 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu) return next; } -static notrace void trace_iterator_increment(struct trace_iterator *iter) +static void trace_iterator_increment(struct trace_iterator *iter) { iter->idx++; iter->next_idx[iter->cpu]++; @@ -828,7 +828,7 @@ static notrace void trace_iterator_increment(struct trace_iterator *iter) } } -static notrace void trace_consume(struct trace_iterator *iter) +static void trace_consume(struct trace_iterator *iter) { struct trace_array_cpu *data = iter->tr->data[iter->cpu]; @@ -844,7 +844,7 @@ static notrace void trace_consume(struct trace_iterator *iter) data->trace_idx = 0; } -static notrace void *find_next_entry_inc(struct trace_iterator *iter) +static void *find_next_entry_inc(struct trace_iterator *iter) { struct trace_entry *next; int next_cpu = -1; @@ -863,7 +863,7 @@ static notrace void *find_next_entry_inc(struct trace_iterator *iter) return next ? 
iter : NULL; } -static notrace void *s_next(struct seq_file *m, void *v, loff_t *pos) +static void *s_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_iterator *iter = m->private; void *last_ent = iter->ent; @@ -978,7 +978,7 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, # define IP_FMT "%016lx" #endif -static notrace int +static int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) { int ret; @@ -999,7 +999,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } -static notrace void print_lat_help_header(struct seq_file *m) +static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n"); seq_puts(m, "# / _-----=> irqs-off \n"); @@ -1012,14 +1012,14 @@ static notrace void print_lat_help_header(struct seq_file *m) seq_puts(m, "# \\ / ||||| \\ | / \n"); } -static notrace void print_func_help_header(struct seq_file *m) +static void print_func_help_header(struct seq_file *m) { seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); seq_puts(m, "# | | | | |\n"); } -static notrace void +static void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); @@ -1090,7 +1090,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) seq_puts(m, "\n"); } -static notrace void +static void lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) { int hardirq, softirq; @@ -1127,7 +1127,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) unsigned long preempt_mark_thresh = 100; -static notrace void +static void lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, unsigned long rel_usecs) { @@ -1142,7 +1142,7 @@ lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; -static notrace int +static int print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) { struct trace_seq *s = &iter->seq; @@ -1206,7 +1206,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) return 1; } -static notrace int print_trace_fmt(struct trace_iterator *iter) +static int print_trace_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); @@ -1279,7 +1279,7 @@ static notrace int print_trace_fmt(struct trace_iterator *iter) return 1; } -static notrace int print_raw_fmt(struct trace_iterator *iter) +static int print_raw_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; @@ -1336,7 +1336,7 @@ do { \ return 0; \ } while (0) -static notrace int print_hex_fmt(struct trace_iterator *iter) +static int print_hex_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; unsigned char newline = '\n'; @@ -1375,7 +1375,7 @@ static notrace int print_hex_fmt(struct trace_iterator *iter) return 1; } -static notrace int print_bin_fmt(struct trace_iterator *iter) +static int print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; @@ -1475,7 +1475,7 @@ static struct seq_operations tracer_seq_ops = { .show = s_show, }; -static struct trace_iterator notrace * +static struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, int *ret) { struct trace_iterator *iter; @@ -1572,7 +1572,7 @@ static int tracing_lt_open(struct inode *inode, struct file *file) } 
-static notrace void * +static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct tracer *t = m->private; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index faf9f67246a..2b7352bf1ce 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -150,7 +150,7 @@ struct trace_iterator { long idx; }; -void notrace tracing_reset(struct trace_array_cpu *data); +void tracing_reset(struct trace_array_cpu *data); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *tracing_init_dentry(void); void ftrace(struct trace_array *tr, @@ -189,10 +189,10 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); -extern notrace cycle_t ftrace_now(int cpu); +extern cycle_t ftrace_now(int cpu); #ifdef CONFIG_SCHED_TRACER -extern void notrace +extern void wakeup_sched_switch(struct task_struct *prev, struct task_struct *next); #else static inline void diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 69a0eb00a0a..4165d34bd28 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -16,7 +16,7 @@ #include "trace.h" -static notrace void function_reset(struct trace_array *tr) +static void function_reset(struct trace_array *tr) { int cpu; @@ -26,30 +26,30 @@ static notrace void function_reset(struct trace_array *tr) tracing_reset(tr->data[cpu]); } -static notrace void start_function_trace(struct trace_array *tr) +static void start_function_trace(struct trace_array *tr) { function_reset(tr); tracing_start_function_trace(); } -static notrace void stop_function_trace(struct trace_array *tr) +static void stop_function_trace(struct trace_array *tr) { tracing_stop_function_trace(); } -static notrace void function_trace_init(struct trace_array *tr) +static void function_trace_init(struct trace_array *tr) { if (tr->ctrl) start_function_trace(tr); } -static notrace void function_trace_reset(struct trace_array *tr) +static void function_trace_reset(struct trace_array *tr) { if (tr->ctrl) stop_function_trace(tr); } -static notrace void function_trace_ctrl_update(struct trace_array *tr) +static void function_trace_ctrl_update(struct trace_array *tr) { if (tr->ctrl) start_function_trace(tr); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 2ac0d09db6f..7a4dc014b8a 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -33,7 +33,7 @@ enum { static int trace_type __read_mostly; #ifdef CONFIG_PREEMPT_TRACER -static inline int notrace +static inline int preempt_trace(void) { return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count()); @@ -43,7 +43,7 @@ preempt_trace(void) #endif #ifdef CONFIG_IRQSOFF_TRACER -static inline int notrace +static inline int irq_trace(void) { return ((trace_type & TRACER_IRQS_OFF) && @@ -67,7 +67,7 @@ static __cacheline_aligned_in_smp unsigned long max_sequence; /* * irqsoff uses its own tracer function to keep the overhead down: */ -static void notrace +static void irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) { struct trace_array *tr = irqsoff_trace; @@ -109,7 +109,7 @@ static struct ftrace_ops trace_ops __read_mostly = /* * Should this new latency be reported/recorded? 
*/ -static int notrace report_latency(cycle_t delta) +static int report_latency(cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) @@ -121,7 +121,7 @@ static int notrace report_latency(cycle_t delta) return 1; } -static void notrace +static void check_critical_timing(struct trace_array *tr, struct trace_array_cpu *data, unsigned long parent_ip, @@ -191,7 +191,7 @@ out: trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); } -static inline void notrace +static inline void start_critical_timing(unsigned long ip, unsigned long parent_ip) { int cpu; @@ -228,7 +228,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_dec(&data->disabled); } -static inline void notrace +static inline void stop_critical_timing(unsigned long ip, unsigned long parent_ip) { int cpu; @@ -261,13 +261,13 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) } /* start and stop critical timings used to for stoppage (in idle) */ -void notrace start_critical_timings(void) +void start_critical_timings(void) { if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } -void notrace stop_critical_timings(void) +void stop_critical_timings(void) { if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); @@ -275,13 +275,13 @@ void notrace stop_critical_timings(void) #ifdef CONFIG_IRQSOFF_TRACER #ifdef CONFIG_PROVE_LOCKING -void notrace time_hardirqs_on(unsigned long a0, unsigned long a1) +void time_hardirqs_on(unsigned long a0, unsigned long a1) { if (!preempt_trace() && irq_trace()) stop_critical_timing(a0, a1); } -void notrace time_hardirqs_off(unsigned long a0, unsigned long a1) +void time_hardirqs_off(unsigned long a0, unsigned long a1) { if (!preempt_trace() && irq_trace()) start_critical_timing(a0, a1); @@ -309,35 +309,35 @@ void trace_softirqs_off(unsigned long ip) { } -inline notrace void print_irqtrace_events(struct task_struct *curr) +inline void print_irqtrace_events(struct task_struct *curr) { } /* * We are only interested in hardirq on/off events: */ -void notrace trace_hardirqs_on(void) +void trace_hardirqs_on(void) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL(trace_hardirqs_on); -void notrace trace_hardirqs_off(void) +void trace_hardirqs_off(void) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL(trace_hardirqs_off); -void notrace trace_hardirqs_on_caller(unsigned long caller_addr) +void trace_hardirqs_on_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); -void notrace trace_hardirqs_off_caller(unsigned long caller_addr) +void trace_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); @@ -348,12 +348,12 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_IRQSOFF_TRACER */ #ifdef CONFIG_PREEMPT_TRACER -void notrace trace_preempt_on(unsigned long a0, unsigned long a1) +void trace_preempt_on(unsigned long a0, unsigned long a1) { stop_critical_timing(a0, a1); } -void notrace trace_preempt_off(unsigned long a0, unsigned long a1) +void trace_preempt_off(unsigned long a0, unsigned long a1) { start_critical_timing(a0, a1); } @@ -395,14 +395,14 @@ static void irqsoff_tracer_ctrl_update(struct trace_array *tr) stop_irqsoff_tracer(tr); } -static void notrace irqsoff_tracer_open(struct 
trace_iterator *iter) +static void irqsoff_tracer_open(struct trace_iterator *iter) { /* stop the trace while dumping */ if (iter->tr->ctrl) stop_irqsoff_tracer(iter->tr); } -static void notrace irqsoff_tracer_close(struct trace_iterator *iter) +static void irqsoff_tracer_close(struct trace_iterator *iter) { if (iter->tr->ctrl) start_irqsoff_tracer(iter->tr); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8d656672da9..b738eaca1db 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -17,7 +17,7 @@ static struct trace_array *ctx_trace; static int __read_mostly tracer_enabled; -static void notrace +static void ctx_switch_func(struct task_struct *prev, struct task_struct *next) { struct trace_array *tr = ctx_trace; @@ -57,7 +57,7 @@ void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) wakeup_sched_switch(prev, next); } -static notrace void sched_switch_reset(struct trace_array *tr) +static void sched_switch_reset(struct trace_array *tr) { int cpu; @@ -67,18 +67,18 @@ static notrace void sched_switch_reset(struct trace_array *tr) tracing_reset(tr->data[cpu]); } -static notrace void start_sched_trace(struct trace_array *tr) +static void start_sched_trace(struct trace_array *tr) { sched_switch_reset(tr); tracer_enabled = 1; } -static notrace void stop_sched_trace(struct trace_array *tr) +static void stop_sched_trace(struct trace_array *tr) { tracer_enabled = 0; } -static notrace void sched_switch_trace_init(struct trace_array *tr) +static void sched_switch_trace_init(struct trace_array *tr) { ctx_trace = tr; @@ -86,7 +86,7 @@ static notrace void sched_switch_trace_init(struct trace_array *tr) start_sched_trace(tr); } -static notrace void sched_switch_trace_reset(struct trace_array *tr) +static void sched_switch_trace_reset(struct trace_array *tr) { if (tr->ctrl) stop_sched_trace(tr); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 3549e4154f1..662679c78b6 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -27,12 +27,12 @@ static unsigned wakeup_prio = -1; static DEFINE_SPINLOCK(wakeup_lock); -static void notrace __wakeup_reset(struct trace_array *tr); +static void __wakeup_reset(struct trace_array *tr); /* * Should this new latency be reported/recorded? 
*/ -static int notrace report_latency(cycle_t delta) +static int report_latency(cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) @@ -44,7 +44,7 @@ static int notrace report_latency(cycle_t delta) return 1; } -void notrace +void wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) { unsigned long latency = 0, t0 = 0, t1 = 0; @@ -126,7 +126,7 @@ out: atomic_dec(&tr->data[cpu]->disabled); } -static void notrace __wakeup_reset(struct trace_array *tr) +static void __wakeup_reset(struct trace_array *tr) { struct trace_array_cpu *data; int cpu; @@ -147,7 +147,7 @@ static void notrace __wakeup_reset(struct trace_array *tr) wakeup_task = NULL; } -static void notrace wakeup_reset(struct trace_array *tr) +static void wakeup_reset(struct trace_array *tr) { unsigned long flags; @@ -156,7 +156,7 @@ static void notrace wakeup_reset(struct trace_array *tr) spin_unlock_irqrestore(&wakeup_lock, flags); } -static notrace void +static void wakeup_check_start(struct trace_array *tr, struct task_struct *p, struct task_struct *curr) { @@ -201,7 +201,7 @@ out: atomic_dec(&tr->data[cpu]->disabled); } -notrace void +void ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) { if (likely(!tracer_enabled)) @@ -210,7 +210,7 @@ ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) wakeup_check_start(wakeup_trace, wakee, curr); } -notrace void +void ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr) { if (likely(!tracer_enabled)) @@ -219,7 +219,7 @@ ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr) wakeup_check_start(wakeup_trace, wakee, curr); } -static notrace void start_wakeup_tracer(struct trace_array *tr) +static void start_wakeup_tracer(struct trace_array *tr) { wakeup_reset(tr); @@ -237,12 +237,12 @@ static notrace void start_wakeup_tracer(struct trace_array *tr) return; } -static notrace void stop_wakeup_tracer(struct trace_array *tr) +static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; } -static notrace void wakeup_tracer_init(struct trace_array *tr) +static void wakeup_tracer_init(struct trace_array *tr) { wakeup_trace = tr; @@ -250,7 +250,7 @@ static notrace void wakeup_tracer_init(struct trace_array *tr) start_wakeup_tracer(tr); } -static notrace void wakeup_tracer_reset(struct trace_array *tr) +static void wakeup_tracer_reset(struct trace_array *tr) { if (tr->ctrl) { stop_wakeup_tracer(tr); @@ -267,14 +267,14 @@ static void wakeup_tracer_ctrl_update(struct trace_array *tr) stop_wakeup_tracer(tr); } -static void notrace wakeup_tracer_open(struct trace_iterator *iter) +static void wakeup_tracer_open(struct trace_iterator *iter) { /* stop the trace while dumping */ if (iter->tr->ctrl) stop_wakeup_tracer(iter->tr); } -static void notrace wakeup_tracer_close(struct trace_iterator *iter) +static void wakeup_tracer_close(struct trace_iterator *iter) { /* forget about any processes we were recording */ if (iter->tr->ctrl) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 546307de6e3..85715b86a34 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -3,7 +3,7 @@ #include #include -static notrace inline int trace_valid_entry(struct trace_entry *entry) +static inline int trace_valid_entry(struct trace_entry *entry) { switch (entry->type) { case TRACE_FN: -- cgit v1.2.3 From 57422797dc009fc83766bcf230d29dbe6e08e21e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:51 +0200 Subject: ftrace: 
add wakeup events to sched tracer Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 42 +++++++++++++++++++++++++++++++++++---- kernel/trace/trace.h | 12 +++++++++++ kernel/trace/trace_sched_switch.c | 36 +++++++++++++++++++++++++++++++++ kernel/trace/trace_sched_wakeup.c | 2 +- kernel/trace/trace_selftest.c | 1 + 5 files changed, 88 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f5898051fdd..192c1354a7e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -57,7 +57,7 @@ static struct trace_array max_tr; static DEFINE_PER_CPU(struct trace_array_cpu, max_data); static int tracer_enabled = 1; -static unsigned long trace_nr_entries = 16384UL; +static unsigned long trace_nr_entries = 65536UL; static struct tracer *trace_types __read_mostly; static struct tracer *current_trace __read_mostly; @@ -87,6 +87,7 @@ enum trace_type { TRACE_FN, TRACE_CTX, + TRACE_WAKE, TRACE_SPECIAL, __TRACE_LAST_TYPE @@ -711,6 +712,30 @@ tracing_sched_switch_trace(struct trace_array *tr, wake_up (&trace_wait); } +void +tracing_sched_wakeup_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *wakee, struct task_struct *curr, + unsigned long flags) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + spin_lock_irqsave(&data->lock, irq_flags); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_WAKE; + entry->ctx.prev_pid = curr->pid; + entry->ctx.prev_prio = curr->prio; + entry->ctx.prev_state = curr->state; + entry->ctx.next_pid = wakee->pid; + entry->ctx.next_prio = wakee->prio; + spin_unlock_irqrestore(&data->lock, irq_flags); + + if (!(trace_flags & TRACE_ITER_BLOCK)) + wake_up(&trace_wait); +} + #ifdef CONFIG_FTRACE static void function_trace_call(unsigned long ip, unsigned long parent_ip) @@ -1183,13 +1208,14 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) trace_seq_puts(s, ")\n"); break; case TRACE_CTX: + case TRACE_WAKE: S = entry->ctx.prev_state < sizeof(state_to_char) ? state_to_char[entry->ctx.prev_state] : 'X'; comm = trace_find_cmdline(entry->ctx.next_pid); - trace_seq_printf(s, " %d:%d:%c --> %d:%d %s\n", + trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d %s\n", entry->ctx.prev_pid, entry->ctx.prev_prio, - S, + S, entry->type == TRACE_CTX ? "==>" : " +", entry->ctx.next_pid, entry->ctx.next_prio, comm); @@ -1256,12 +1282,14 @@ static int print_trace_fmt(struct trace_iterator *iter) return 0; break; case TRACE_CTX: + case TRACE_WAKE: S = entry->ctx.prev_state < sizeof(state_to_char) ? state_to_char[entry->ctx.prev_state] : 'X'; - ret = trace_seq_printf(s, " %d:%d:%c ==> %d:%d\n", + ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d\n", entry->ctx.prev_pid, entry->ctx.prev_prio, S, + entry->type == TRACE_CTX ? "==>" : " +", entry->ctx.next_pid, entry->ctx.next_prio); if (!ret) @@ -1301,8 +1329,11 @@ static int print_raw_fmt(struct trace_iterator *iter) return 0; break; case TRACE_CTX: + case TRACE_WAKE: S = entry->ctx.prev_state < sizeof(state_to_char) ? state_to_char[entry->ctx.prev_state] : 'X'; + if (entry->type == TRACE_WAKE) + S = '+'; ret = trace_seq_printf(s, "%d %d %c %d %d\n", entry->ctx.prev_pid, entry->ctx.prev_prio, @@ -1355,8 +1386,11 @@ static int print_hex_fmt(struct trace_iterator *iter) SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); break; case TRACE_CTX: + case TRACE_WAKE: S = entry->ctx.prev_state < sizeof(state_to_char) ? 
state_to_char[entry->ctx.prev_state] : 'X'; + if (entry->type == TRACE_WAKE) + S = '+'; SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid); SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio); SEQ_PUT_HEX_FIELD_RET(s, S); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2b7352bf1ce..90e0ba0f6eb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -164,6 +164,12 @@ void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *next, unsigned long flags); void tracing_record_cmdline(struct task_struct *tsk); + +void tracing_sched_wakeup_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *wakee, + struct task_struct *cur, + unsigned long flags); void trace_special(struct trace_array *tr, struct trace_array_cpu *data, unsigned long arg1, @@ -194,11 +200,17 @@ extern cycle_t ftrace_now(int cpu); #ifdef CONFIG_SCHED_TRACER extern void wakeup_sched_switch(struct task_struct *prev, struct task_struct *next); +extern void +wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr); #else static inline void wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) { } +static inline void +wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr) +{ +} #endif #ifdef CONFIG_CONTEXT_SWITCH_TRACER diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index b738eaca1db..8b1cf1a3aee 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -41,6 +41,29 @@ ctx_switch_func(struct task_struct *prev, struct task_struct *next) local_irq_restore(flags); } +static void wakeup_func(struct task_struct *wakee, struct task_struct *curr) +{ + struct trace_array *tr = ctx_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (!tracer_enabled) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) { tracing_record_cmdline(prev); @@ -57,6 +80,19 @@ void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) wakeup_sched_switch(prev, next); } +void +ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) +{ + tracing_record_cmdline(curr); + + wakeup_func(wakee, curr); + + /* + * Chain to the wakeup tracer (this is a NOP if disabled): + */ + wakeup_sched_wakeup(wakee, curr); +} + static void sched_switch_reset(struct trace_array *tr) { int cpu; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 662679c78b6..87fa7b253b5 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -202,7 +202,7 @@ out: } void -ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) +wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr) { if (likely(!tracer_enabled)) return; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 85715b86a34..39dd452647d 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -8,6 +8,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) switch (entry->type) { case TRACE_FN: case TRACE_CTX: + case TRACE_WAKE: return 1; } return 0; -- cgit v1.2.3 From 86387f7ee5d3273ff4859e2c64ce656639b6ca65 Mon 
Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:51 +0200 Subject: ftrace: add stack tracing Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/Kconfig | 1 + kernel/trace/trace.c | 103 ++++++++++++++++++++++++++++++++++++++++++--------- kernel/trace/trace.h | 11 ++++++ 3 files changed, 97 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3f73a171024..eb1988ed84b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -10,6 +10,7 @@ config TRACER_MAX_TRACE config TRACING bool select DEBUG_FS + select STACKTRACE config FTRACE bool "Kernel Function Tracer" diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 192c1354a7e..b4b1b4fe99f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -28,6 +28,8 @@ #include #include +#include + #include "trace.h" unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; @@ -88,6 +90,7 @@ enum trace_type { TRACE_FN, TRACE_CTX, TRACE_WAKE, + TRACE_STACK, TRACE_SPECIAL, __TRACE_LAST_TYPE @@ -109,6 +112,7 @@ enum trace_iterator_flags { TRACE_ITER_HEX = 0x20, TRACE_ITER_BIN = 0x40, TRACE_ITER_BLOCK = 0x80, + TRACE_ITER_STACKTRACE = 0x100, }; #define TRACE_ITER_SYM_MASK \ @@ -124,10 +128,11 @@ static const char *trace_options[] = { "hex", "bin", "block", + "stacktrace", NULL }; -static unsigned trace_flags; +static unsigned trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_STACKTRACE; static DEFINE_SPINLOCK(ftrace_max_lock); @@ -657,7 +662,7 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, spin_unlock_irqrestore(&data->lock, irq_flags); if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up (&trace_wait); + wake_up(&trace_wait); } void @@ -685,13 +690,39 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data, spin_unlock_irqrestore(&data->lock, irq_flags); if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up (&trace_wait); + wake_up(&trace_wait); +} + +void __trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip) +{ + struct trace_entry *entry; + struct stack_trace trace; + + if (!(trace_flags & TRACE_ITER_STACKTRACE)) + return; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_STACK; + + memset(&entry->stack, 0, sizeof(entry->stack)); + + trace.nr_entries = 0; + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.skip = skip; + trace.entries = entry->stack.caller; + + save_stack_trace(&trace); } void tracing_sched_switch_trace(struct trace_array *tr, struct trace_array_cpu *data, - struct task_struct *prev, struct task_struct *next, + struct task_struct *prev, + struct task_struct *next, unsigned long flags) { struct trace_entry *entry; @@ -706,16 +737,18 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.prev_state = prev->state; entry->ctx.next_pid = next->pid; entry->ctx.next_prio = next->prio; + __trace_stack(tr, data, flags, 4); spin_unlock_irqrestore(&data->lock, irq_flags); if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up (&trace_wait); + wake_up(&trace_wait); } void tracing_sched_wakeup_trace(struct trace_array *tr, struct trace_array_cpu *data, - struct task_struct *wakee, struct task_struct *curr, + struct task_struct *wakee, + struct task_struct *curr, unsigned long flags) { struct trace_entry *entry; @@ -730,6 +763,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.prev_state = curr->state; entry->ctx.next_pid = wakee->pid; 
entry->ctx.next_prio = wakee->prio; + __trace_stack(tr, data, flags, 5); spin_unlock_irqrestore(&data->lock, irq_flags); if (!(trace_flags & TRACE_ITER_BLOCK)) @@ -1179,6 +1213,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) unsigned long rel_usecs; char *comm; int S; + int i; if (!next_entry) next_entry = entry; @@ -1197,8 +1232,10 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); } else { - lat_print_generic(s, entry, cpu); - lat_print_timestamp(s, abs_usecs, rel_usecs); + if (entry->type != TRACE_STACK) { + lat_print_generic(s, entry, cpu); + lat_print_timestamp(s, abs_usecs, rel_usecs); + } } switch (entry->type) { case TRACE_FN: @@ -1226,6 +1263,14 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) entry->special.arg2, entry->special.arg3); break; + case TRACE_STACK: + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (i) + trace_seq_puts(s, " <= "); + seq_print_ip_sym(s, entry->stack.caller[i], sym_flags); + } + trace_seq_puts(s, "\n"); + break; default: trace_seq_printf(s, "Unknown type %d\n", entry->type); } @@ -1241,8 +1286,9 @@ static int print_trace_fmt(struct trace_iterator *iter) unsigned long long t; unsigned long secs; char *comm; - int S; int ret; + int S; + int i; entry = iter->ent; @@ -1252,15 +1298,17 @@ static int print_trace_fmt(struct trace_iterator *iter) usec_rem = do_div(t, 1000000ULL); secs = (unsigned long)t; - ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); - if (!ret) - return 0; - ret = trace_seq_printf(s, "[%02d] ", iter->cpu); - if (!ret) - return 0; - ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); - if (!ret) - return 0; + if (entry->type != TRACE_STACK) { + ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); + if (!ret) + return 0; + ret = trace_seq_printf(s, "[%02d] ", iter->cpu); + if (!ret) + return 0; + ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); + if (!ret) + return 0; + } switch (entry->type) { case TRACE_FN: @@ -1303,6 +1351,22 @@ static int print_trace_fmt(struct trace_iterator *iter) if (!ret) return 0; break; + case TRACE_STACK: + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (i) { + ret = trace_seq_puts(s, " <= "); + if (!ret) + return 0; + } + ret = seq_print_ip_sym(s, entry->stack.caller[i], + sym_flags); + if (!ret) + return 0; + } + ret = trace_seq_puts(s, "\n"); + if (!ret) + return 0; + break; } return 1; } @@ -1344,6 +1408,7 @@ static int print_raw_fmt(struct trace_iterator *iter) return 0; break; case TRACE_SPECIAL: + case TRACE_STACK: ret = trace_seq_printf(s, " %lx %lx %lx\n", entry->special.arg1, entry->special.arg2, @@ -1399,6 +1464,7 @@ static int print_hex_fmt(struct trace_iterator *iter) SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); break; case TRACE_SPECIAL: + case TRACE_STACK: SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1); SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2); SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3); @@ -1433,6 +1499,7 @@ static int print_bin_fmt(struct trace_iterator *iter) SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); break; case TRACE_SPECIAL: + case TRACE_STACK: SEQ_PUT_FIELD_RET(s, entry->special.arg1); SEQ_PUT_FIELD_RET(s, entry->special.arg2); SEQ_PUT_FIELD_RET(s, entry->special.arg3); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 90e0ba0f6eb..387bdcf45e2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -34,6 +34,16 @@ struct special_entry { unsigned long arg3; }; +/* + * Stack-trace 
entry: + */ + +#define FTRACE_STACK_ENTRIES 5 + +struct stack_entry { + unsigned long caller[FTRACE_STACK_ENTRIES]; +}; + /* * The trace entry - the most basic unit of tracing. This is what * is printed in the end as a single line in the trace output, such as: @@ -51,6 +61,7 @@ struct trace_entry { struct ftrace_entry fn; struct ctx_switch_entry ctx; struct special_entry special; + struct stack_entry stack; }; }; -- cgit v1.2.3 From 8ac0fca4ccb355ce50471d7aa3f10f5900b28b95 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:51 +0200 Subject: ftrace: sched tracer fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 2 +- kernel/trace/trace_sched_wakeup.c | 13 +++---------- 2 files changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 328494e28df..53ab1174664 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2613,7 +2613,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - ftrace_wake_up_new_task(p, rq->curr); + ftrace_wake_up_task(p, rq->curr); check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 87fa7b253b5..2a012423f9d 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -201,20 +201,13 @@ out: atomic_dec(&tr->data[cpu]->disabled); } -void -wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr) +void wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr) { if (likely(!tracer_enabled)) return; - wakeup_check_start(wakeup_trace, wakee, curr); -} - -void -ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr) -{ - if (likely(!tracer_enabled)) - return; + tracing_record_cmdline(curr); + tracing_record_cmdline(wakee); wakeup_check_start(wakeup_trace, wakee, curr); } -- cgit v1.2.3 From 4c1f4d4f0175129934d5dbc19a39296430937a05 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:51 +0200 Subject: ftrace: make nostacktrace the default Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b4b1b4fe99f..0e4b7119e26 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -132,7 +132,7 @@ static const char *trace_options[] = { NULL }; -static unsigned trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_STACKTRACE; +static unsigned trace_flags = TRACE_ITER_PRINT_PARENT; static DEFINE_SPINLOCK(ftrace_max_lock); -- cgit v1.2.3 From 4e65551905fb0300ae7e667cbaa41ee2e3f29a13 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: sched tracer, trace full rbtree Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 35 ++++++++++++++++++++++--- kernel/trace/trace.c | 55 ++++++++++++++++----------------------- kernel/trace/trace.h | 14 ++++++++++ kernel/trace/trace_sched_switch.c | 24 +++++++++++------ 4 files changed, 85 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 53ab1174664..b9208a0e33a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2394,6 +2394,35 @@ static int sched_balance_self(int cpu, int flag) #endif /* CONFIG_SMP */ +#ifdef CONFIG_CONTEXT_SWITCH_TRACER + +void 
ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) +{ + struct sched_entity *se; + struct task_struct *p; + struct rb_node *curr; + struct rq *rq = __rq; + + curr = first_fair(&rq->cfs); + if (!curr) + return; + + while (curr) { + se = rb_entry(curr, struct sched_entity, run_node); + if (!entity_is_task(se)) + continue; + + p = task_of(se); + + __trace_special(__tr, __data, + p->pid, p->se.vruntime, p->se.sum_exec_runtime); + + curr = rb_next(curr); + } +} + +#endif + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -2468,7 +2497,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) out_activate: #endif /* CONFIG_SMP */ - ftrace_wake_up_task(p, rq->curr); + ftrace_wake_up_task(rq, p, rq->curr); schedstat_inc(p, se.nr_wakeups); if (sync) schedstat_inc(p, se.nr_wakeups_sync); @@ -2613,7 +2642,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - ftrace_wake_up_task(p, rq->curr); + ftrace_wake_up_task(rq, p, rq->curr); check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2786,7 +2815,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - ftrace_ctx_switch(prev, next); + ftrace_ctx_switch(rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0e4b7119e26..65173b14b91 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -66,7 +66,18 @@ static struct tracer *current_trace __read_mostly; static int max_tracer_type_len; static DEFINE_MUTEX(trace_types_lock); -static DECLARE_WAIT_QUEUE_HEAD (trace_wait); +static DECLARE_WAIT_QUEUE_HEAD(trace_wait); + +unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; + +/* + * FIXME: where should this be called? 
+ */ +void trace_wake_up(void) +{ + if (!(trace_flags & TRACE_ITER_BLOCK)) + wake_up(&trace_wait); +} #define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry)) @@ -103,18 +114,6 @@ enum trace_flag_type { TRACE_FLAG_SOFTIRQ = 0x08, }; -enum trace_iterator_flags { - TRACE_ITER_PRINT_PARENT = 0x01, - TRACE_ITER_SYM_OFFSET = 0x02, - TRACE_ITER_SYM_ADDR = 0x04, - TRACE_ITER_VERBOSE = 0x08, - TRACE_ITER_RAW = 0x10, - TRACE_ITER_HEX = 0x20, - TRACE_ITER_BIN = 0x40, - TRACE_ITER_BLOCK = 0x80, - TRACE_ITER_STACKTRACE = 0x100, -}; - #define TRACE_ITER_SYM_MASK \ (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) @@ -132,8 +131,6 @@ static const char *trace_options[] = { NULL }; -static unsigned trace_flags = TRACE_ITER_PRINT_PARENT; - static DEFINE_SPINLOCK(ftrace_max_lock); /* @@ -660,9 +657,6 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, entry->fn.ip = ip; entry->fn.parent_ip = parent_ip; spin_unlock_irqrestore(&data->lock, irq_flags); - - if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up(&trace_wait); } void @@ -673,10 +667,14 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags); } +#ifdef CONFIG_CONTEXT_SWITCH_TRACER + void -trace_special(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) { + struct trace_array_cpu *data = __data; + struct trace_array *tr = __tr; struct trace_entry *entry; unsigned long irq_flags; @@ -688,11 +686,10 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data, entry->special.arg2 = arg2; entry->special.arg3 = arg3; spin_unlock_irqrestore(&data->lock, irq_flags); - - if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up(&trace_wait); } +#endif + void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, @@ -739,9 +736,6 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.next_prio = next->prio; __trace_stack(tr, data, flags, 4); spin_unlock_irqrestore(&data->lock, irq_flags); - - if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up(&trace_wait); } void @@ -765,9 +759,6 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.next_prio = wakee->prio; __trace_stack(tr, data, flags, 5); spin_unlock_irqrestore(&data->lock, irq_flags); - - if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up(&trace_wait); } #ifdef CONFIG_FTRACE @@ -1258,7 +1249,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) comm); break; case TRACE_SPECIAL: - trace_seq_printf(s, " %lx %lx %lx\n", + trace_seq_printf(s, " %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); @@ -1344,7 +1335,7 @@ static int print_trace_fmt(struct trace_iterator *iter) return 0; break; case TRACE_SPECIAL: - ret = trace_seq_printf(s, " %lx %lx %lx\n", + ret = trace_seq_printf(s, " %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); @@ -1409,7 +1400,7 @@ static int print_raw_fmt(struct trace_iterator *iter) break; case TRACE_SPECIAL: case TRACE_STACK: - ret = trace_seq_printf(s, " %lx %lx %lx\n", + ret = trace_seq_printf(s, " %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 387bdcf45e2..75e23747567 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -274,4 +274,18 @@ extern int 
trace_selftest_startup_sched_switch(struct tracer *trace, extern void *head_page(struct trace_array_cpu *data); +extern unsigned long trace_flags; + +enum trace_iterator_flags { + TRACE_ITER_PRINT_PARENT = 0x01, + TRACE_ITER_SYM_OFFSET = 0x02, + TRACE_ITER_SYM_ADDR = 0x04, + TRACE_ITER_VERBOSE = 0x08, + TRACE_ITER_RAW = 0x10, + TRACE_ITER_HEX = 0x20, + TRACE_ITER_BIN = 0x40, + TRACE_ITER_BLOCK = 0x80, + TRACE_ITER_STACKTRACE = 0x100, +}; + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8b1cf1a3aee..12658b3f2b2 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -18,7 +18,7 @@ static struct trace_array *ctx_trace; static int __read_mostly tracer_enabled; static void -ctx_switch_func(struct task_struct *prev, struct task_struct *next) +ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) { struct trace_array *tr = ctx_trace; struct trace_array_cpu *data; @@ -34,14 +34,17 @@ ctx_switch_func(struct task_struct *prev, struct task_struct *next) data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) + if (likely(disabled == 1)) { tracing_sched_switch_trace(tr, data, prev, next, flags); + ftrace_all_fair_tasks(__rq, tr, data); + } atomic_dec(&data->disabled); local_irq_restore(flags); } -static void wakeup_func(struct task_struct *wakee, struct task_struct *curr) +static void +wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr) { struct trace_array *tr = ctx_trace; struct trace_array_cpu *data; @@ -57,14 +60,18 @@ static void wakeup_func(struct task_struct *wakee, struct task_struct *curr) data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) + if (likely(disabled == 1)) { tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); + ftrace_all_fair_tasks(__rq, tr, data); + } atomic_dec(&data->disabled); local_irq_restore(flags); } -void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) +void +ftrace_ctx_switch(void *__rq, struct task_struct *prev, + struct task_struct *next) { tracing_record_cmdline(prev); @@ -72,7 +79,7 @@ void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) * If tracer_switch_func only points to the local * switch func, it still needs the ptr passed to it. 
*/ - ctx_switch_func(prev, next); + ctx_switch_func(__rq, prev, next); /* * Chain to the wakeup tracer (this is a NOP if disabled): @@ -81,11 +88,12 @@ void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) } void -ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) +ftrace_wake_up_task(void *__rq, struct task_struct *wakee, + struct task_struct *curr) { tracing_record_cmdline(curr); - wakeup_func(wakee, curr); + wakeup_func(__rq, wakee, curr); /* * Chain to the wakeup tracer (this is a NOP if disabled): -- cgit v1.2.3 From 24cd5d111e8c713e62cda7ca1d01232402e3d3c9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: trace curr/next tasks Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b9208a0e33a..673b588b713 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2398,8 +2398,8 @@ static int sched_balance_self(int cpu, int flag) void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) { - struct sched_entity *se; struct task_struct *p; + struct sched_entity *se; struct rb_node *curr; struct rq *rq = __rq; @@ -2407,6 +2407,17 @@ void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) if (!curr) return; + if (rq->cfs.curr) { + p = task_of(rq->cfs.curr); + __trace_special(__tr, __data, + p->pid, p->se.vruntime, p->se.sum_exec_runtime); + } + if (rq->cfs.next) { + p = task_of(rq->cfs.next); + __trace_special(__tr, __data, + p->pid, p->se.vruntime, p->se.sum_exec_runtime); + } + while (curr) { se = rb_entry(curr, struct sched_entity, run_node); if (!entity_is_task(se)) -- cgit v1.2.3 From 017730c11241e26577673eb9d957cfc66172ea91 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: fix wakeups Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 18 ++++++++++++++++++ kernel/trace/trace.c | 15 +++++++++++---- 2 files changed, 29 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 673b588b713..9ca4a2e6a23 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -642,6 +642,24 @@ static inline void update_rq_clock(struct rq *rq) # define const_debug static const #endif +/** + * runqueue_is_locked + * + * Returns true if the current cpu runqueue is locked. + * This interface allows printk to be called with the runqueue lock + * held and know whether or not it is OK to wake up the klogd. + */ +int runqueue_is_locked(void) +{ + int cpu = get_cpu(); + struct rq *rq = cpu_rq(cpu); + int ret; + + ret = spin_is_locked(&rq->lock); + put_cpu(); + return ret; +} + /* * Debugging: various feature bits */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 65173b14b91..2ca9d66aa74 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -70,12 +70,13 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; -/* - * FIXME: where should this be called? 
- */ void trace_wake_up(void) { - if (!(trace_flags & TRACE_ITER_BLOCK)) + /* + * The runqueue_is_locked() can fail, but this is the best we + * have for now: + */ + if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked()) wake_up(&trace_wait); } @@ -657,6 +658,8 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, entry->fn.ip = ip; entry->fn.parent_ip = parent_ip; spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); } void @@ -686,6 +689,8 @@ __trace_special(void *__tr, void *__data, entry->special.arg2 = arg2; entry->special.arg3 = arg3; spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); } #endif @@ -759,6 +764,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.next_prio = wakee->prio; __trace_stack(tr, data, flags, 5); spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); } #ifdef CONFIG_FTRACE -- cgit v1.2.3 From 1a3c3034336320554a3342572dae98d69e054fc7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: fix __trace_special() Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2ca9d66aa74..65d2c0a61ed 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -670,8 +670,6 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags); } -#ifdef CONFIG_CONTEXT_SWITCH_TRACER - void __trace_special(void *__tr, void *__data, unsigned long arg1, unsigned long arg2, unsigned long arg3) @@ -693,8 +691,6 @@ __trace_special(void *__tr, void *__data, trace_wake_up(); } -#endif - void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, -- cgit v1.2.3 From 4ac3ba41d372e3a9e420b36bc43589662b188a14 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: trace scheduler rbtree Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 1 + kernel/trace/trace.h | 1 + kernel/trace/trace_sched_switch.c | 6 ++++-- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 65d2c0a61ed..06380dc1ebe 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -129,6 +129,7 @@ static const char *trace_options[] = { "bin", "block", "stacktrace", + "sched-tree", NULL }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 75e23747567..a52015702a2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -286,6 +286,7 @@ enum trace_iterator_flags { TRACE_ITER_BIN = 0x40, TRACE_ITER_BLOCK = 0x80, TRACE_ITER_STACKTRACE = 0x100, + TRACE_ITER_SCHED_TREE = 0x200, }; #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 12658b3f2b2..5555b906a66 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -36,7 +36,8 @@ ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) if (likely(disabled == 1)) { tracing_sched_switch_trace(tr, data, prev, next, flags); - ftrace_all_fair_tasks(__rq, tr, data); + if (trace_flags & TRACE_ITER_SCHED_TREE) + ftrace_all_fair_tasks(__rq, tr, data); } atomic_dec(&data->disabled); @@ -62,7 +63,8 @@ wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr) if (likely(disabled == 1)) { 
tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); - ftrace_all_fair_tasks(__rq, tr, data); + if (trace_flags & TRACE_ITER_SCHED_TREE) + ftrace_all_fair_tasks(__rq, tr, data); } atomic_dec(&data->disabled); -- cgit v1.2.3 From c7078de1aaf562a31b20984409c38cc1b40fa8a3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: add tracing_cpumask Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 70 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 06380dc1ebe..18f0ab88ca8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -70,6 +70,23 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; +/* + * Only trace on a CPU if the bitmask is set: + */ +static cpumask_t tracing_cpumask __read_mostly = CPU_MASK_ALL; + +/* + * The tracer itself will not take this lock, but still we want + * to provide a consistent cpumask to user-space: + */ +static DEFINE_MUTEX(tracing_cpumask_update_lock); + +/* + * Temporary storage for the character representation of the + * CPU bitmask: + */ +static char mask_str[NR_CPUS]; + void trace_wake_up(void) { /* @@ -1754,9 +1771,49 @@ static struct file_operations tracing_lt_fops = { }; static struct file_operations show_traces_fops = { - .open = show_traces_open, - .read = seq_read, - .release = seq_release, + .open = show_traces_open, + .read = seq_read, + .release = seq_release, +}; + +static ssize_t +tracing_cpumask_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + int err; + + count = min(count, (size_t)NR_CPUS); + + mutex_lock(&tracing_cpumask_update_lock); + cpumask_scnprintf(mask_str, NR_CPUS, tracing_cpumask); + err = copy_to_user(ubuf, mask_str, count); + if (err) + count = -EFAULT; + mutex_unlock(&tracing_cpumask_update_lock); + + return count; +} + +static ssize_t +tracing_cpumask_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + int err; + + mutex_lock(&tracing_cpumask_update_lock); + err = cpumask_parse_user(ubuf, count, tracing_cpumask); + mutex_unlock(&tracing_cpumask_update_lock); + + if (err) + return err; + + return count; +} + +static struct file_operations tracing_cpumask_fops = { + .open = tracing_open_generic, + .read = tracing_cpumask_read, + .write = tracing_cpumask_write, }; static ssize_t @@ -1837,9 +1894,9 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, } static struct file_operations tracing_iter_fops = { - .open = tracing_open_generic, - .read = tracing_iter_ctrl_read, - .write = tracing_iter_ctrl_write, + .open = tracing_open_generic, + .read = tracing_iter_ctrl_read, + .write = tracing_iter_ctrl_write, }; static const char readme_msg[] = @@ -1870,8 +1927,8 @@ tracing_readme_read(struct file *filp, char __user *ubuf, } static struct file_operations tracing_readme_fops = { - .open = tracing_open_generic, - .read = tracing_readme_read, + .open = tracing_open_generic, + .read = tracing_readme_read, }; static ssize_t @@ -2334,6 +2391,11 @@ static __init void tracer_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs 'iter_ctrl' entry\n"); + entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, + NULL, &tracing_cpumask_fops); + if (!entry) + pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); + entry = debugfs_create_file("latency_trace", 0444, 
d_tracer, &global_trace, &tracing_lt_fops); if (!entry) -- cgit v1.2.3 From 36dfe9252bd4c9b55e8454363fb7e444c92c5030 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: make use of tracing_cpumask Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 95 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 64 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 18f0ab88ca8..3ae17e254fc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -70,23 +70,6 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; -/* - * Only trace on a CPU if the bitmask is set: - */ -static cpumask_t tracing_cpumask __read_mostly = CPU_MASK_ALL; - -/* - * The tracer itself will not take this lock, but still we want - * to provide a consistent cpumask to user-space: - */ -static DEFINE_MUTEX(tracing_cpumask_update_lock); - -/* - * Temporary storage for the character representation of the - * CPU bitmask: - */ -static char mask_str[NR_CPUS]; - void trace_wake_up(void) { /* @@ -1776,19 +1759,46 @@ static struct file_operations show_traces_fops = { .release = seq_release, }; +/* + * Only trace on a CPU if the bitmask is set: + */ +static cpumask_t tracing_cpumask = CPU_MASK_ALL; + +/* + * When tracing/tracing_cpu_mask is modified then this holds + * the new bitmask we are about to install: + */ +static cpumask_t tracing_cpumask_new; + +/* + * The tracer itself will not take this lock, but still we want + * to provide a consistent cpumask to user-space: + */ +static DEFINE_MUTEX(tracing_cpumask_update_lock); + +/* + * Temporary storage for the character representation of the + * CPU bitmask (and one more byte for the newline): + */ +static char mask_str[NR_CPUS + 1]; + static ssize_t tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - int err; - - count = min(count, (size_t)NR_CPUS); + int len; mutex_lock(&tracing_cpumask_update_lock); - cpumask_scnprintf(mask_str, NR_CPUS, tracing_cpumask); - err = copy_to_user(ubuf, mask_str, count); - if (err) - count = -EFAULT; + + len = cpumask_scnprintf(mask_str, count, tracing_cpumask); + if (count - len < 2) { + count = -EINVAL; + goto out_err; + } + len += sprintf(mask_str + len, "\n"); + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); + +out_err: mutex_unlock(&tracing_cpumask_update_lock); return count; @@ -1798,16 +1808,40 @@ static ssize_t tracing_cpumask_write(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) { - int err; + int err, cpu; mutex_lock(&tracing_cpumask_update_lock); - err = cpumask_parse_user(ubuf, count, tracing_cpumask); - mutex_unlock(&tracing_cpumask_update_lock); - + err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); if (err) - return err; + goto err_unlock; + + spin_lock_irq(&ftrace_max_lock); + for_each_possible_cpu(cpu) { + /* + * Increase/decrease the disabled counter if we are + * about to flip a bit in the cpumask: + */ + if (cpu_isset(cpu, tracing_cpumask) && + !cpu_isset(cpu, tracing_cpumask_new)) { + atomic_inc(&global_trace.data[cpu]->disabled); + } + if (!cpu_isset(cpu, tracing_cpumask) && + cpu_isset(cpu, tracing_cpumask_new)) { + atomic_dec(&global_trace.data[cpu]->disabled); + } + } + spin_unlock_irq(&ftrace_max_lock); + + tracing_cpumask = tracing_cpumask_new; + + mutex_unlock(&tracing_cpumask_update_lock); return count; + +err_unlock: + 
mutex_unlock(&tracing_cpumask_update_lock); + + return err; } static struct file_operations tracing_cpumask_fops = { @@ -1846,8 +1880,7 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, r += sprintf(buf + r, "\n"); WARN_ON(r >= len + 2); - r = simple_read_from_buffer(ubuf, cnt, ppos, - buf, r); + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); kfree(buf); -- cgit v1.2.3 From d9af56fbd8b37181bffaad060f74aa2e17382874 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:53 +0200 Subject: ftrace: fix cmdline tracing Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace_sched_switch.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 5555b906a66..5a217e86358 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -29,6 +29,8 @@ ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) if (!tracer_enabled) return; + tracing_record_cmdline(prev); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -56,6 +58,8 @@ wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr) if (!tracer_enabled) return; + tracing_record_cmdline(curr); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -75,8 +79,6 @@ void ftrace_ctx_switch(void *__rq, struct task_struct *prev, struct task_struct *next) { - tracing_record_cmdline(prev); - /* * If tracer_switch_func only points to the local * switch func, it still needs the ptr passed to it. @@ -93,8 +95,6 @@ void ftrace_wake_up_task(void *__rq, struct task_struct *wakee, struct task_struct *curr) { - tracing_record_cmdline(curr); - wakeup_func(__rq, wakee, curr); /* -- cgit v1.2.3 From 442e544ce52d4415a024920b84fb95c5f9aa0855 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:53 +0200 Subject: ftrace: iter ctrl fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3ae17e254fc..688b4cf72d9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1920,6 +1920,11 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, break; } } + /* + * If no option could be set, return an error: + */ + if (!trace_options[i]) + return -EINVAL; filp->f_pos += cnt; -- cgit v1.2.3 From f29c73fe3404f8799ed57aaf48859e0b55fc071f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:53 +0200 Subject: ftrace: include cpu in stacktrace Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 688b4cf72d9..3a4032492fc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1227,10 +1227,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); } else { - if (entry->type != TRACE_STACK) { - lat_print_generic(s, entry, cpu); - lat_print_timestamp(s, abs_usecs, rel_usecs); - } + lat_print_generic(s, entry, cpu); + lat_print_timestamp(s, abs_usecs, rel_usecs); } switch (entry->type) { case TRACE_FN: @@ -1293,17 +1291,15 @@ static int print_trace_fmt(struct trace_iterator *iter) usec_rem = do_div(t, 
1000000ULL); secs = (unsigned long)t; - if (entry->type != TRACE_STACK) { - ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); - if (!ret) - return 0; - ret = trace_seq_printf(s, "[%02d] ", iter->cpu); - if (!ret) - return 0; - ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); - if (!ret) - return 0; - } + ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); + if (!ret) + return 0; + ret = trace_seq_printf(s, "[%02d] ", iter->cpu); + if (!ret) + return 0; + ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); + if (!ret) + return 0; switch (entry->type) { case TRACE_FN: -- cgit v1.2.3 From 36fc25a9f48deacd8aa08cd2d1c186a4e412604f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:53 +0200 Subject: ftrace: sched tree fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9ca4a2e6a23..3bc7c536299 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2414,6 +2414,23 @@ static int sched_balance_self(int cpu, int flag) #ifdef CONFIG_CONTEXT_SWITCH_TRACER +void ftrace_task(struct task_struct *p, void *__tr, void *__data) +{ +#if 0 + /* + * trace timeline tree + */ + __trace_special(__tr, __data, + p->pid, p->se.vruntime, p->se.sum_exec_runtime); +#else + /* + * trace balance metrics + */ + __trace_special(__tr, __data, + p->pid, p->se.avg_overlap, 0); +#endif +} + void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) { struct task_struct *p; @@ -2421,32 +2438,22 @@ void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) struct rb_node *curr; struct rq *rq = __rq; - curr = first_fair(&rq->cfs); - if (!curr) - return; - if (rq->cfs.curr) { p = task_of(rq->cfs.curr); - __trace_special(__tr, __data, - p->pid, p->se.vruntime, p->se.sum_exec_runtime); + ftrace_task(p, __tr, __data); } if (rq->cfs.next) { p = task_of(rq->cfs.next); - __trace_special(__tr, __data, - p->pid, p->se.vruntime, p->se.sum_exec_runtime); + ftrace_task(p, __tr, __data); } - while (curr) { + for (curr = first_fair(&rq->cfs); curr; curr = rb_next(curr)) { se = rb_entry(curr, struct sched_entity, run_node); if (!entity_is_task(se)) continue; p = task_of(se); - - __trace_special(__tr, __data, - p->pid, p->se.vruntime, p->se.sum_exec_runtime); - - curr = rb_next(curr); + ftrace_task(p, __tr, __data); } } -- cgit v1.2.3 From 88a4216c3ec4281fc7e6725cc3a3ccd01fb1aa14 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:53 +0200 Subject: ftrace: sched special Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched_fair.c | 3 +++ kernel/trace/trace.c | 6 +++--- kernel/trace/trace_sched_switch.c | 24 ++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e24ecd39c4b..dc1856f1079 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1061,6 +1061,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, if (!(this_sd->flags & SD_WAKE_AFFINE)) return 0; + ftrace_special(__LINE__, curr->se.avg_overlap, sync); + ftrace_special(__LINE__, p->se.avg_overlap, -1); /* * If the currently running task will sleep within * a reasonable amount of time then attract this newly @@ -1238,6 +1240,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (unlikely(se == pse)) return; + ftrace_special(__LINE__, 
p->pid, se->last_wakeup); cfs_rq_of(pse)->next = pse; /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3a4032492fc..b87a2641489 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1251,7 +1251,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) comm); break; case TRACE_SPECIAL: - trace_seq_printf(s, " %ld %ld %ld\n", + trace_seq_printf(s, "# %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); @@ -1335,7 +1335,7 @@ static int print_trace_fmt(struct trace_iterator *iter) return 0; break; case TRACE_SPECIAL: - ret = trace_seq_printf(s, " %ld %ld %ld\n", + ret = trace_seq_printf(s, "# %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); @@ -1400,7 +1400,7 @@ static int print_raw_fmt(struct trace_iterator *iter) break; case TRACE_SPECIAL: case TRACE_STACK: - ret = trace_seq_printf(s, " %ld %ld %ld\n", + ret = trace_seq_printf(s, "# %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 5a217e86358..bddf676914e 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -103,6 +103,30 @@ ftrace_wake_up_task(void *__rq, struct task_struct *wakee, wakeup_sched_wakeup(wakee, curr); } +void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + struct trace_array *tr = ctx_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (!tracer_enabled) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + __trace_special(tr, data, arg1, arg2, arg3); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + static void sched_switch_reset(struct trace_array *tr) { int cpu; -- cgit v1.2.3 From bac524d3f3dfeffa3a9d44f2c64035b88bcaacb4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 12 May 2008 21:20:53 +0200 Subject: ftrace: trace next state Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 35 +++++++++++++++++++++++++---------- kernel/trace/trace.h | 1 + 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b87a2641489..b63fe909f87 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -736,6 +736,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.prev_state = prev->state; entry->ctx.next_pid = next->pid; entry->ctx.next_prio = next->prio; + entry->ctx.next_state = next->state; __trace_stack(tr, data, flags, 4); spin_unlock_irqrestore(&data->lock, irq_flags); } @@ -759,6 +760,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.prev_state = curr->state; entry->ctx.next_pid = wakee->pid; entry->ctx.next_prio = wakee->prio; + entry->ctx.next_state = wakee->state; __trace_stack(tr, data, flags, 5); spin_unlock_irqrestore(&data->lock, irq_flags); @@ -1207,7 +1209,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) unsigned long abs_usecs; unsigned long rel_usecs; char *comm; - int S; + int S, T; int i; if (!next_entry) @@ -1241,14 +1243,17 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) case TRACE_WAKE: S = entry->ctx.prev_state < sizeof(state_to_char) ? 
state_to_char[entry->ctx.prev_state] : 'X'; + T = entry->ctx.next_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.next_state] : 'X'; + comm = trace_find_cmdline(entry->ctx.next_pid); - trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d %s\n", + trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n", entry->ctx.prev_pid, entry->ctx.prev_prio, S, entry->type == TRACE_CTX ? "==>" : " +", entry->ctx.next_pid, entry->ctx.next_prio, - comm); + T, comm); break; case TRACE_SPECIAL: trace_seq_printf(s, "# %ld %ld %ld\n", @@ -1280,7 +1285,7 @@ static int print_trace_fmt(struct trace_iterator *iter) unsigned long secs; char *comm; int ret; - int S; + int S, T; int i; entry = iter->ent; @@ -1324,13 +1329,16 @@ static int print_trace_fmt(struct trace_iterator *iter) case TRACE_WAKE: S = entry->ctx.prev_state < sizeof(state_to_char) ? state_to_char[entry->ctx.prev_state] : 'X'; - ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d\n", + T = entry->ctx.next_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.next_state] : 'X'; + ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n", entry->ctx.prev_pid, entry->ctx.prev_prio, S, entry->type == TRACE_CTX ? "==>" : " +", entry->ctx.next_pid, - entry->ctx.next_prio); + entry->ctx.next_prio, + T); if (!ret) return 0; break; @@ -1367,7 +1375,7 @@ static int print_raw_fmt(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; struct trace_entry *entry; int ret; - int S; + int S, T; entry = iter->ent; @@ -1387,14 +1395,17 @@ static int print_raw_fmt(struct trace_iterator *iter) case TRACE_WAKE: S = entry->ctx.prev_state < sizeof(state_to_char) ? state_to_char[entry->ctx.prev_state] : 'X'; + T = entry->ctx.next_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.next_state] : 'X'; if (entry->type == TRACE_WAKE) S = '+'; - ret = trace_seq_printf(s, "%d %d %c %d %d\n", + ret = trace_seq_printf(s, "%d %d %c %d %d %c\n", entry->ctx.prev_pid, entry->ctx.prev_prio, S, entry->ctx.next_pid, - entry->ctx.next_prio); + entry->ctx.next_prio, + T); if (!ret) return 0; break; @@ -1428,7 +1439,7 @@ static int print_hex_fmt(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; unsigned char newline = '\n'; struct trace_entry *entry; - int S; + int S, T; entry = iter->ent; @@ -1445,6 +1456,8 @@ static int print_hex_fmt(struct trace_iterator *iter) case TRACE_WAKE: S = entry->ctx.prev_state < sizeof(state_to_char) ? state_to_char[entry->ctx.prev_state] : 'X'; + T = entry->ctx.next_state < sizeof(state_to_char) ? 
+ state_to_char[entry->ctx.next_state] : 'X'; if (entry->type == TRACE_WAKE) S = '+'; SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid); @@ -1453,6 +1466,7 @@ static int print_hex_fmt(struct trace_iterator *iter) SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid); SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio); SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); + SEQ_PUT_HEX_FIELD_RET(s, T); break; case TRACE_SPECIAL: case TRACE_STACK: @@ -1488,6 +1502,7 @@ static int print_bin_fmt(struct trace_iterator *iter) SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state); SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid); SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); + SEQ_PUT_FIELD_RET(s, entry->ctx.next_state); break; case TRACE_SPECIAL: case TRACE_STACK: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a52015702a2..96951a8d09a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -23,6 +23,7 @@ struct ctx_switch_entry { unsigned char prev_state; unsigned int next_pid; unsigned char next_prio; + unsigned char next_state; }; /* -- cgit v1.2.3 From 5429db2d26a59903c81a4f6c6dae7eb9daaea5fc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 12 May 2008 21:20:53 +0200 Subject: ftrace: fix wakeup callback Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3bc7c536299..1ec3fb2efee 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2533,7 +2533,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) out_activate: #endif /* CONFIG_SMP */ - ftrace_wake_up_task(rq, p, rq->curr); schedstat_inc(p, se.nr_wakeups); if (sync) schedstat_inc(p, se.nr_wakeups_sync); @@ -2548,6 +2547,7 @@ out_activate: success = 1; out_running: + ftrace_wake_up_task(rq, p, rq->curr); check_preempt_curr(rq, p); p->state = TASK_RUNNING; -- cgit v1.2.3 From 694379e9ed4f2f6babe111bf001c69e2e263338b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:54 +0200 Subject: ftrace: make it more available in the Kconfig Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/Kconfig | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index eb1988ed84b..ebc158e6d59 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -14,7 +14,7 @@ config TRACING config FTRACE bool "Kernel Function Tracer" - depends on DEBUG_KERNEL && HAVE_FTRACE + depends on HAVE_FTRACE select FRAME_POINTER select TRACING select CONTEXT_SWITCH_TRACER @@ -72,7 +72,6 @@ config PREEMPT_TRACER config SCHED_TRACER bool "Scheduling Latency Tracer" - depends on DEBUG_KERNEL select TRACING select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE @@ -82,7 +81,6 @@ config SCHED_TRACER config CONTEXT_SWITCH_TRACER bool "Trace process context switches" - depends on DEBUG_KERNEL select TRACING select MARKERS help -- cgit v1.2.3 From 8f96da02c14d722ad9a3713cd7273ce28c9036ad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:54 +0200 Subject: ftrace: remove wakeup from function trace trace_function is called by mcount and calling wake_up from that can have unpredictable results. This patch removes the wakeup from trace_function. 
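The hazard here is re-entrancy: mcount fires for nearly every kernel function, so a hook that calls wake_up() can end up tracing the wakeup path and recursing into itself. Below is a minimal user-space analogue of the guard the tracer relies on instead, the per-CPU "disabled" nesting counter seen throughout this series; all names are demo-only, not kernel APIs.

    #include <stdio.h>

    /* Analogue of the per-CPU "disabled" counter: the hook only does
     * real work at nesting depth one, so if that work is itself traced
     * and re-enters the hook, the inner call is dropped instead of
     * recursing without bound.
     */
    static __thread int hook_depth;

    static void do_real_work(const char *what)
    {
            printf("trace: %s\n", what);
    }

    static void trace_hook(const char *what)
    {
            if (++hook_depth == 1)
                    do_real_work(what);
            --hook_depth;
    }

    int main(void)
    {
            trace_hook("function entry");
            return 0;
    }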
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b63fe909f87..736dcfb3ed0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -659,8 +659,6 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, entry->fn.ip = ip; entry->fn.parent_ip = parent_ip; spin_unlock_irqrestore(&data->lock, irq_flags); - - trace_wake_up(); } void -- cgit v1.2.3 From 4fe8c3048cd8280a54256bca9cac2007bd546c33 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:54 +0200 Subject: ftrace: printk and trace irqsoff and wakeups printk called from wakeup critical timings and irqs off can cause deadlocks since printk might do a wakeup itself. If the call to printk happens with the runqueue lock held, it can deadlock. This patch protects the printk from being called in trace irqs off with a test to see if the runqueue for the current CPU is locked. If it is locked, the printk is skipped. The wakeup always holds the runqueue lock, so the printk is simply removed. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace_irqsoff.c | 26 ++++++++++++++------------ kernel/trace/trace_sched_wakeup.c | 13 ------------- 2 files changed, 14 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 7a4dc014b8a..d0c1748b1e2 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -165,18 +165,20 @@ check_critical_timing(struct trace_array *tr, update_max_tr_single(tr, current, cpu); - if (tracing_thresh) { - printk(KERN_INFO "(%16s-%-5d|#%d):" - " %lu us critical section violates %lu us threshold.\n", - current->comm, current->pid, - raw_smp_processor_id(), - latency, nsecs_to_usecs(tracing_thresh)); - } else { - printk(KERN_INFO "(%16s-%-5d|#%d):" - " new %lu us maximum-latency critical section.\n", - current->comm, current->pid, - raw_smp_processor_id(), - latency); + if (!runqueue_is_locked()) { + if (tracing_thresh) { + printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical" + " section violates %lu us threshold.\n", + current->comm, current->pid, + raw_smp_processor_id(), + latency, nsecs_to_usecs(tracing_thresh)); + } else { + printk(KERN_INFO "(%16s-%-5d|#%d): new %lu us" + " maximum-latency critical section.\n", + current->comm, current->pid, + raw_smp_processor_id(), + latency); + } } max_sequence++; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 2a012423f9d..5948011006b 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -106,19 +106,6 @@ wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) update_max_tr(tr, wakeup_task, wakeup_cpu); - if (tracing_thresh) { - printk(KERN_INFO "(%16s-%-5d|#%d):" - " %lu us wakeup latency violates %lu us threshold.\n", - wakeup_task->comm, wakeup_task->pid, - raw_smp_processor_id(), - latency, nsecs_to_usecs(tracing_thresh)); - } else { - printk(KERN_INFO "(%16s-%-5d|#%d):" - " new %lu us maximum wakeup latency.\n", - wakeup_task->comm, wakeup_task->pid, - cpu, latency); - } - out_unlock: __wakeup_reset(tr); spin_unlock_irqrestore(&wakeup_lock, flags); -- cgit v1.2.3 From 06fa75ab566c50e01bfd7b055bde85cf9b1bc98a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:54 +0200 Subject: ftrace: add 
TRACE_STACK and TRACE_SPECIAL to selftest validation The selftest validation code checks for valid entries in the trace buffer. TRACE_STACK and TRACE_SPECIAL have been added to the code but not to the validator. This patch adds the two to prevent them from flagging a failure in the selftest. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace_selftest.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 39dd452647d..92f4acb7740 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -9,6 +9,8 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_FN: case TRACE_CTX: case TRACE_WAKE: + case TRACE_STACK: + case TRACE_SPECIAL: return 1; } return 0; @@ -180,7 +182,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* we should only have one item */ if (!ret && count != 1) { - printk(KERN_CONT ".. filter failed .."); + printk(KERN_CONT ".. filter failed count=%ld ..", count); ret = -1; goto out; } -- cgit v1.2.3 From d05cdb25d80f06f77aa6bddb53cd1390d4d91a0b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:54 +0200 Subject: ftrace: fix dynamic ftrace selftest With the adding of the configuration changes in the Makefile to prevent tracing of functions in the ftrace code, all tracing of all the ftrace code has been removed. Unfortunately, one of the selftests, relied on a function to be traced. With the new change, the function was no longer traced and the test failed. This patch separates out the test function into its own file so that we can add the "-pg" flag to the compilation of that function and the adding of the mcount call to that function. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/Makefile | 4 ++++ kernel/trace/trace.h | 2 ++ kernel/trace/trace_selftest.c | 6 ------ kernel/trace/trace_selftest_dynamic.c | 7 +++++++ 4 files changed, 13 insertions(+), 6 deletions(-) create mode 100644 kernel/trace/trace_selftest_dynamic.c (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c25a6cd6a52..d9efbbfa2bd 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -4,6 +4,10 @@ ifdef CONFIG_FTRACE ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) + +# selftest needs instrumentation +CFLAGS_trace_selftest_dynamic.o = -pg +obj-y += trace_selftest_dynamic.o endif obj-$(CONFIG_FTRACE) += libftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 96951a8d09a..98cbfd05d75 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -244,6 +244,8 @@ extern int unregister_tracer_switch(struct tracer_switch_ops *ops); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; +#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func +extern int DYN_FTRACE_TEST_NAME(void); #endif #ifdef CONFIG_FTRACE_STARTUP_TEST diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 92f4acb7740..83e55a2000c 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -107,14 +107,8 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) #ifdef CONFIG_DYNAMIC_FTRACE -#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func #define __STR(x) #x #define STR(x) __STR(x) -static int DYN_FTRACE_TEST_NAME(void) -{ - /* used to call mcount */ - return 0; -} /* Test dynamic code modification and ftrace filters */ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c new file mode 100644 index 00000000000..54dd77cce5b --- /dev/null +++ b/kernel/trace/trace_selftest_dynamic.c @@ -0,0 +1,7 @@ +#include "trace.h" + +int DYN_FTRACE_TEST_NAME(void) +{ + /* used to call mcount */ + return 0; +} -- cgit v1.2.3 From 4d9493c90f8e6e1b164aede3814010a290161abb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:54 +0200 Subject: ftrace: remove add-hoc code Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 47 --------------------------------------- kernel/sched_fair.c | 3 --- kernel/trace/trace_sched_switch.c | 10 ++------- 3 files changed, 2 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1ec3fb2efee..ad95cca4e42 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2412,53 +2412,6 @@ static int sched_balance_self(int cpu, int flag) #endif /* CONFIG_SMP */ -#ifdef CONFIG_CONTEXT_SWITCH_TRACER - -void ftrace_task(struct task_struct *p, void *__tr, void *__data) -{ -#if 0 - /* - * trace timeline tree - */ - __trace_special(__tr, __data, - p->pid, p->se.vruntime, p->se.sum_exec_runtime); -#else - /* - * trace balance metrics - */ - __trace_special(__tr, __data, - p->pid, p->se.avg_overlap, 0); -#endif -} - -void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) -{ - struct task_struct *p; - struct sched_entity *se; - struct rb_node *curr; - struct rq *rq = __rq; - - if (rq->cfs.curr) { - p = task_of(rq->cfs.curr); - ftrace_task(p, __tr, __data); - } - if (rq->cfs.next) { - p = task_of(rq->cfs.next); - ftrace_task(p, __tr, __data); - } 
- - for (curr = first_fair(&rq->cfs); curr; curr = rb_next(curr)) { - se = rb_entry(curr, struct sched_entity, run_node); - if (!entity_is_task(se)) - continue; - - p = task_of(se); - ftrace_task(p, __tr, __data); - } -} - -#endif - /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index dc1856f1079..e24ecd39c4b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1061,8 +1061,6 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, if (!(this_sd->flags & SD_WAKE_AFFINE)) return 0; - ftrace_special(__LINE__, curr->se.avg_overlap, sync); - ftrace_special(__LINE__, p->se.avg_overlap, -1); /* * If the currently running task will sleep within * a reasonable amount of time then attract this newly @@ -1240,7 +1238,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (unlikely(se == pse)) return; - ftrace_special(__LINE__, p->pid, se->last_wakeup); cfs_rq_of(pse)->next = pse; /* diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index bddf676914e..5671db0e182 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -36,11 +36,8 @@ ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { + if (likely(disabled == 1)) tracing_sched_switch_trace(tr, data, prev, next, flags); - if (trace_flags & TRACE_ITER_SCHED_TREE) - ftrace_all_fair_tasks(__rq, tr, data); - } atomic_dec(&data->disabled); local_irq_restore(flags); @@ -65,11 +62,8 @@ wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr) data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { + if (likely(disabled == 1)) tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); - if (trace_flags & TRACE_ITER_SCHED_TREE) - ftrace_all_fair_tasks(__rq, tr, data); - } atomic_dec(&data->disabled); local_irq_restore(flags); -- cgit v1.2.3 From c5f888cae49dfe3e86d9d1e0dab2b63ceb057be3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:55 +0200 Subject: ftrace: irqsoff use raw_smp_processor_id This patch changes the use of __get_cpu_var to explicitly calling raw_smp_processor_id and using the per_cpu() macro. On some debug configurations, the use of __get_cpu_var may cause ftrace to trigger and this can cause problems with the irqsoff tracing. 
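A minimal module-style sketch of the accessor pattern the hunk below moves to, with demo-only names: the CPU id is taken once with raw_smp_processor_id() and the per-CPU slot is then addressed with per_cpu(), avoiding accessors whose debug plumbing may itself be traced.

    #include <linux/module.h>
    #include <linux/percpu.h>
    #include <linux/smp.h>

    /* Demo only.  The tracer calls this pattern with interrupts off;
     * an ordinary module would normally pin itself with get_cpu()/
     * put_cpu() instead of using the raw accessor.
     */
    static DEFINE_PER_CPU(int, demo_flag);

    static int __init demo_init(void)
    {
            int cpu = raw_smp_processor_id();

            per_cpu(demo_flag, cpu) = 1;
            pr_info("demo_flag set on cpu %d\n", cpu);
            return 0;
    }

    static void __exit demo_exit(void)
    {
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");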
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace_irqsoff.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index d0c1748b1e2..761f3ec66c5 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -204,14 +204,14 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) if (likely(!tracer_enabled)) return; - if (__get_cpu_var(tracing_cpu)) + cpu = raw_smp_processor_id(); + + if (per_cpu(tracing_cpu, cpu)) return; - cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(!data) || unlikely(!head_page(data)) || - atomic_read(&data->disabled)) + if (unlikely(!data) || atomic_read(&data->disabled)) return; atomic_inc(&data->disabled); @@ -225,7 +225,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) trace_function(tr, data, ip, parent_ip, flags); - __get_cpu_var(tracing_cpu) = 1; + per_cpu(tracing_cpu, cpu) = 1; atomic_dec(&data->disabled); } @@ -238,16 +238,16 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; + cpu = raw_smp_processor_id(); /* Always clear the tracing cpu on stopping the trace */ - if (unlikely(__get_cpu_var(tracing_cpu))) - __get_cpu_var(tracing_cpu) = 0; + if (unlikely(per_cpu(tracing_cpu, cpu))) + per_cpu(tracing_cpu, cpu) = 0; else return; if (!tracer_enabled) return; - cpu = raw_smp_processor_id(); data = tr->data[cpu]; if (unlikely(!data) || unlikely(!head_page(data)) || @@ -255,6 +255,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) return; atomic_inc(&data->disabled); + local_save_flags(flags); trace_function(tr, data, ip, parent_ip, flags); check_critical_timing(tr, data, parent_ip ? : ip, cpu); @@ -376,7 +377,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr) static void __irqsoff_tracer_init(struct trace_array *tr) { irqsoff_trace = tr; - /* make sure that the tracer is visibel */ + /* make sure that the tracer is visible */ smp_wmb(); if (tr->ctrl) -- cgit v1.2.3 From 92205c2343527a863d660360599a4bf8cede77b0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:55 +0200 Subject: ftrace: user raw_spin_lock in tracing Lock debugging enabled cause huge performance problems for tracing. Having the lock verification happening for every function that is called because mcount calls spin_lock can cripple the system. This patch converts the spin_locks used by ftrace into raw_spin_locks. 
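A small self-contained sketch of the same locking idiom used in the hunks that follow, with demo-only names. It is illustrative only: the arch-level raw_spinlock_t plus raw_local_irq_save() bypasses lockdep and the spinlock debug code, which is exactly what the tracer needs here, but ordinary kernel code should keep using spinlock_t.

    #include <linux/module.h>
    #include <linux/spinlock.h>

    static raw_spinlock_t demo_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
    static unsigned long demo_hits;

    static void demo_hit(void)
    {
            unsigned long flags;

            /* no lockdep or spinlock-debug hooks run on this path */
            raw_local_irq_save(flags);
            __raw_spin_lock(&demo_lock);
            demo_hits++;
            __raw_spin_unlock(&demo_lock);
            raw_local_irq_restore(flags);
    }

    static int __init demo_init(void)
    {
            demo_hit();
            pr_info("demo_hits=%lu\n", demo_hits);
            return 0;
    }

    static void __exit demo_exit(void)
    {
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");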
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 51 ++++++++++++++++++++++++++++++--------------------- kernel/trace/trace.h | 2 +- 2 files changed, 31 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 736dcfb3ed0..3009aafa4dd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -133,7 +133,8 @@ static const char *trace_options[] = { NULL }; -static DEFINE_SPINLOCK(ftrace_max_lock); +static raw_spinlock_t ftrace_max_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; /* * Copy the new maximum trace into the separate maximum-trace @@ -335,7 +336,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) int i; WARN_ON_ONCE(!irqs_disabled()); - spin_lock(&ftrace_max_lock); + __raw_spin_lock(&ftrace_max_lock); /* clear out all the previous traces */ for_each_possible_cpu(i) { data = tr->data[i]; @@ -344,7 +345,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) } __update_max_tr(tr, tsk, cpu); - spin_unlock(&ftrace_max_lock); + __raw_spin_unlock(&ftrace_max_lock); } /** @@ -360,7 +361,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) int i; WARN_ON_ONCE(!irqs_disabled()); - spin_lock(&ftrace_max_lock); + __raw_spin_lock(&ftrace_max_lock); for_each_possible_cpu(i) tracing_reset(max_tr.data[i]); @@ -368,7 +369,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_reset(data); __update_max_tr(tr, tsk, cpu); - spin_unlock(&ftrace_max_lock); + __raw_spin_unlock(&ftrace_max_lock); } int register_tracer(struct tracer *type) @@ -652,13 +653,15 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, struct trace_entry *entry; unsigned long irq_flags; - spin_lock_irqsave(&data->lock, irq_flags); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); entry->type = TRACE_FN; entry->fn.ip = ip; entry->fn.parent_ip = parent_ip; - spin_unlock_irqrestore(&data->lock, irq_flags); + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); } void @@ -678,14 +681,16 @@ __trace_special(void *__tr, void *__data, struct trace_entry *entry; unsigned long irq_flags; - spin_lock_irqsave(&data->lock, irq_flags); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, 0); entry->type = TRACE_SPECIAL; entry->special.arg1 = arg1; entry->special.arg2 = arg2; entry->special.arg3 = arg3; - spin_unlock_irqrestore(&data->lock, irq_flags); + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); trace_wake_up(); } @@ -725,7 +730,8 @@ tracing_sched_switch_trace(struct trace_array *tr, struct trace_entry *entry; unsigned long irq_flags; - spin_lock_irqsave(&data->lock, irq_flags); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); entry->type = TRACE_CTX; @@ -736,7 +742,8 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.next_prio = next->prio; entry->ctx.next_state = next->state; __trace_stack(tr, data, flags, 4); - spin_unlock_irqrestore(&data->lock, irq_flags); + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); } void @@ -749,7 +756,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct trace_entry *entry; 
unsigned long irq_flags; - spin_lock_irqsave(&data->lock, irq_flags); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); entry->type = TRACE_WAKE; @@ -760,7 +768,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.next_prio = wakee->prio; entry->ctx.next_state = wakee->state; __trace_stack(tr, data, flags, 5); - spin_unlock_irqrestore(&data->lock, irq_flags); + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); trace_wake_up(); } @@ -1824,7 +1833,8 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, if (err) goto err_unlock; - spin_lock_irq(&ftrace_max_lock); + raw_local_irq_disable(); + __raw_spin_lock(&ftrace_max_lock); for_each_possible_cpu(cpu) { /* * Increase/decrease the disabled counter if we are @@ -1839,7 +1849,8 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, atomic_dec(&global_trace.data[cpu]->disabled); } } - spin_unlock_irq(&ftrace_max_lock); + __raw_spin_unlock(&ftrace_max_lock); + raw_local_irq_enable(); tracing_cpumask = tracing_cpumask_new; @@ -2299,7 +2310,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, for_each_cpu_mask(cpu, mask) { data = iter->tr->data[cpu]; - spin_lock(&data->lock); + __raw_spin_lock(&data->lock); } while (find_next_entry_inc(iter) != NULL) { @@ -2320,7 +2331,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, for_each_cpu_mask(cpu, mask) { data = iter->tr->data[cpu]; - spin_unlock(&data->lock); + __raw_spin_unlock(&data->lock); } for_each_cpu_mask(cpu, mask) { @@ -2538,8 +2549,7 @@ static int trace_alloc_page(void) /* Now that we successfully allocate a page per CPU, add them */ for_each_possible_cpu(i) { data = global_trace.data[i]; - spin_lock_init(&data->lock); - lockdep_set_class(&data->lock, &data->lock_key); + data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; page = list_entry(pages.next, struct page, lru); list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); @@ -2547,8 +2557,7 @@ static int trace_alloc_page(void) #ifdef CONFIG_TRACER_MAX_TRACE data = max_tr.data[i]; - spin_lock_init(&data->lock); - lockdep_set_class(&data->lock, &data->lock_key); + data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; page = list_entry(pages.next, struct page, lru); list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 98cbfd05d75..25cba28eb9b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -76,7 +76,7 @@ struct trace_entry { struct trace_array_cpu { struct list_head trace_pages; atomic_t disabled; - spinlock_t lock; + raw_spinlock_t lock; struct lock_class_key lock_key; /* these fields get copied into max-trace: */ -- cgit v1.2.3 From 1d09daa55d2e9bab7e7d30f0d05e5a7bc60b2a4a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:55 +0200 Subject: ftrace: use Makefile to remove tracing from lockdep This patch removes the "notrace" annotation from lockdep and adds the debugging files in the kernel director to those that should not be compiled with "-pg" mcount tracing. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/Makefile | 8 ++++++++ kernel/lockdep.c | 26 +++++++++++++------------- 2 files changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 7e344e7b0cb..d2f80ea4cd9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,6 +11,14 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o +ifdef CONFIG_FTRACE +# Do not profile debug utilities +ORIG_CFLAGS := $(KBUILD_CFLAGS) +KBUILD_CFLAGS = $(if $(filter-out lockdep% %debug,$(basename $(notdir $@))), \ + $(ORIG_CFLAGS), \ + $(subst -pg,,$(ORIG_CFLAGS))) +endif + obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index ac46847ba0c..90a440cbd6d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -271,14 +271,14 @@ static struct list_head chainhash_table[CHAINHASH_SIZE]; ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ (key2)) -notrace void lockdep_off(void) +void lockdep_off(void) { current->lockdep_recursion++; } EXPORT_SYMBOL(lockdep_off); -notrace void lockdep_on(void) +void lockdep_on(void) { current->lockdep_recursion--; } @@ -1041,7 +1041,7 @@ find_usage_forwards(struct lock_class *source, unsigned int depth) * Return 1 otherwise and keep unchanged. * Return 0 on error. */ -static noinline notrace int +static noinline int find_usage_backwards(struct lock_class *source, unsigned int depth) { struct lock_list *entry; @@ -1591,7 +1591,7 @@ static inline int validate_chain(struct task_struct *curr, * We are building curr_chain_key incrementally, so double-check * it from scratch, to make sure that it's done correctly: */ -static notrace void check_chain_key(struct task_struct *curr) +static void check_chain_key(struct task_struct *curr) { #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; @@ -1967,7 +1967,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, /* * Mark all held locks with a usage bit: */ -static notrace int +static int mark_held_locks(struct task_struct *curr, int hardirq) { enum lock_usage_bit usage_bit; @@ -2014,7 +2014,7 @@ void early_boot_irqs_on(void) /* * Hardirqs will be enabled: */ -void notrace trace_hardirqs_on_caller(unsigned long a0) +void trace_hardirqs_on_caller(unsigned long a0) { struct task_struct *curr = current; unsigned long ip; @@ -2060,7 +2060,7 @@ void notrace trace_hardirqs_on_caller(unsigned long a0) } EXPORT_SYMBOL(trace_hardirqs_on_caller); -void notrace trace_hardirqs_on(void) +void trace_hardirqs_on(void) { trace_hardirqs_on_caller(CALLER_ADDR0); } @@ -2069,7 +2069,7 @@ EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void notrace trace_hardirqs_off_caller(unsigned long a0) +void trace_hardirqs_off_caller(unsigned long a0) { struct task_struct *curr = current; @@ -2094,7 +2094,7 @@ void notrace trace_hardirqs_off_caller(unsigned long a0) } EXPORT_SYMBOL(trace_hardirqs_off_caller); -void notrace trace_hardirqs_off(void) +void trace_hardirqs_off(void) { trace_hardirqs_off_caller(CALLER_ADDR0); } @@ -2260,7 +2260,7 @@ static inline int separate_irq_context(struct task_struct *curr, /* * Mark a lock with a usage bit, and validate the state transition: */ -static notrace int mark_lock(struct task_struct *curr, struct held_lock *this, +static int mark_lock(struct 
task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { unsigned int new_mask = 1 << new_bit, ret = 1; @@ -2663,7 +2663,7 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) /* * Check whether we follow the irq-flags state precisely: */ -static notrace void check_flags(unsigned long flags) +static void check_flags(unsigned long flags) { #if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS) if (!debug_locks) @@ -2700,7 +2700,7 @@ static notrace void check_flags(unsigned long flags) * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: */ -notrace void lock_acquire(struct lockdep_map *lock, unsigned int subclass, +void lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, unsigned long ip) { unsigned long flags; @@ -2723,7 +2723,7 @@ notrace void lock_acquire(struct lockdep_map *lock, unsigned int subclass, EXPORT_SYMBOL_GPL(lock_acquire); -notrace void lock_release(struct lockdep_map *lock, int nested, +void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { unsigned long flags; -- cgit v1.2.3 From c1d2327b36f2261ffa8ff7227321ba900c7eee7f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:55 +0200 Subject: ftrace: restrict tracing to HAVE_FTRACE architectures Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ebc158e6d59..f3005717bcd 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -32,6 +32,7 @@ config IRQSOFF_TRACER default n depends on TRACE_IRQFLAGS_SUPPORT depends on GENERIC_TIME + depends on HAVE_FTRACE select TRACE_IRQFLAGS select TRACING select TRACER_MAX_TRACE @@ -54,6 +55,7 @@ config PREEMPT_TRACER default n depends on GENERIC_TIME depends on PREEMPT + depends on HAVE_FTRACE select TRACING select TRACER_MAX_TRACE help @@ -72,6 +74,7 @@ config PREEMPT_TRACER config SCHED_TRACER bool "Scheduling Latency Tracer" + depends on HAVE_FTRACE select TRACING select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE @@ -81,6 +84,7 @@ config SCHED_TRACER config CONTEXT_SWITCH_TRACER bool "Trace process context switches" + depends on HAVE_FTRACE select TRACING select MARKERS help -- cgit v1.2.3 From 07a267cdd2fd7d1de9455b1e36a1635ace7276c7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:55 +0200 Subject: ftrace: add UNINTERRUPTIBLE state for kftraced on disable When dynamic ftrace fails and sets itself disabled, the ftraced daemon will go back to sleep everytime it wakes up. The setting of the ftraced state to UNINTERRUPTIBLE is skipped in this process, and the daemon takes up 100% of the CPU. This patch makes sure the ftraced daemon sets itself to UNINTERRUPTIBLE in that loop. 
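A minimal kthread sketch of the corrected loop shape, with demo-only names: the state must be set to TASK_INTERRUPTIBLE at the top of every iteration, before schedule_timeout(), otherwise schedule_timeout() returns immediately and the daemon spins at 100% CPU, which is what the hunk below fixes in ftraced.

    #include <linux/err.h>
    #include <linux/kthread.h>
    #include <linux/module.h>
    #include <linux/sched.h>

    static struct task_struct *demo_thread;

    static int demo_daemon(void *ignore)
    {
            while (!kthread_should_stop()) {
                    set_current_state(TASK_INTERRUPTIBLE);

                    /* check roughly once a second */
                    schedule_timeout(HZ);

                    if (kthread_should_stop())
                            break;

                    /* the periodic work would go here */
            }
            __set_current_state(TASK_RUNNING);
            return 0;
    }

    static int __init demo_init(void)
    {
            demo_thread = kthread_run(demo_daemon, NULL, "demo_daemon");
            return IS_ERR(demo_thread) ? PTR_ERR(demo_thread) : 0;
    }

    static void __exit demo_exit(void)
    {
            kthread_stop(demo_thread);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");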
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 281d97a3208..40f64f7cd85 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -630,10 +630,10 @@ static int ftraced(void *ignore) { unsigned long usecs; - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + /* check once a second */ schedule_timeout(HZ); @@ -667,8 +667,6 @@ static int ftraced(void *ignore) wake_up_interruptible(&ftraced_waiters); ftrace_shutdown_replenish(); - - set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); return 0; -- cgit v1.2.3 From d15f57f23eaba975309a153b23699cd0c0236974 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:56 +0200 Subject: ftrace: fix mutex unlock in trace output If the trace output changes on reading the trace files, there is a chance that the start function will return NULL. If the start function of a sequence returns NULL the stop equivalent is not called. In this case, all locks that are taken must be released even if they are released in the stop function. This patch fixes a case that a mutex was not released on return of NULL in the start sequence function. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3009aafa4dd..ea11f4ebfae 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -964,8 +964,10 @@ static void *s_start(struct seq_file *m, loff_t *pos) mutex_lock(&trace_types_lock); - if (!current_trace || current_trace != iter->trace) + if (!current_trace || current_trace != iter->trace) { + mutex_unlock(&trace_types_lock); return NULL; + } atomic_inc(&trace_record_cmdline_disabled); -- cgit v1.2.3 From 30afdcb1de0a37a2086145a82ca3febebe47d019 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:56 +0200 Subject: ftrace: selftest protect againt max flip There is a slight race condition in the selftest where the max update of the wakeup and irqs/preemption off tests can be doing a max update as the buffers are being tested. If this happens the system can crash with a GPF. This patch adds the max update spinlock around the checking of the buffers to prevent such a race. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace_selftest.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 83e55a2000c..395274e783b 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -82,10 +82,12 @@ trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) */ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) { - unsigned long cnt = 0; - int cpu; - int ret = 0; + unsigned long flags, cnt = 0; + int cpu, ret = 0; + /* Don't allow flipping of max traces now */ + raw_local_irq_save(flags); + __raw_spin_lock(&ftrace_max_lock); for_each_possible_cpu(cpu) { if (!head_page(tr->data[cpu])) continue; @@ -96,6 +98,8 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) if (ret) break; } + __raw_spin_unlock(&ftrace_max_lock); + raw_local_irq_restore(flags); if (count) *count = cnt; -- cgit v1.2.3 From 72829bc3d63cdc592d8f7dd86ad3b3fe8900fb74 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 23 May 2008 21:37:28 +0200 Subject: ftrace: move enums to ftrace.h and make helper function global picked from the mmiotracer patches to distangle the patch queues. Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 24 ++++++++---------------- kernel/trace/trace.h | 15 +++++++++++++++ 2 files changed, 23 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ea11f4ebfae..0eef0503feb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -37,7 +37,7 @@ unsigned long __read_mostly tracing_thresh; static int tracing_disabled = 1; -static long +long ns2usecs(cycle_t nsec) { nsec += 500; @@ -96,18 +96,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) return nsecs / 1000; } -enum trace_type { - __TRACE_FIRST_TYPE = 0, - - TRACE_FN, - TRACE_CTX, - TRACE_WAKE, - TRACE_STACK, - TRACE_SPECIAL, - - __TRACE_LAST_TYPE -}; - enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, TRACE_FLAG_NEED_RESCHED = 0x02, @@ -190,7 +178,7 @@ void *head_page(struct trace_array_cpu *data) return page_address(page); } -static int +int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) { int len = (PAGE_SIZE - 1) - s->len; @@ -205,7 +193,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) va_end(ap); /* If we can't write it all, don't bother writing anything */ - if (ret > len) + if (ret >= len) return 0; s->len += ret; @@ -638,7 +626,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) pc = preempt_count(); entry->preempt_count = pc & 0xff; - entry->pid = tsk->pid; + entry->pid = (tsk) ? tsk->pid : 0; entry->t = ftrace_now(raw_smp_processor_id()); entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | ((pc & HARDIRQ_MASK) ? 
TRACE_FLAG_HARDIRQ : 0) | @@ -1541,6 +1529,9 @@ static int trace_empty(struct trace_iterator *iter) static int print_trace_line(struct trace_iterator *iter) { + if (iter->trace && iter->trace->print_line) + return iter->trace->print_line(iter); + if (trace_flags & TRACE_ITER_BIN) return print_bin_fmt(iter); @@ -2162,6 +2153,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) return -ENOMEM; iter->tr = &global_trace; + iter->trace = current_trace; filp->private_data = iter; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 25cba28eb9b..b0ca7473671 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -6,6 +6,18 @@ #include #include +enum trace_type { + __TRACE_FIRST_TYPE = 0, + + TRACE_FN, + TRACE_CTX, + TRACE_WAKE, + TRACE_STACK, + TRACE_SPECIAL, + + __TRACE_LAST_TYPE +}; + /* * Function trace entry - function address and parent function addres: */ @@ -130,6 +142,7 @@ struct tracer { int (*selftest)(struct tracer *trace, struct trace_array *tr); #endif + int (*print_line)(struct trace_iterator *iter); struct tracer *next; int print_max; }; @@ -276,6 +289,8 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace, #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); +extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); +extern long ns2usecs(cycle_t nsec); extern unsigned long trace_flags; -- cgit v1.2.3 From d17d969160c18b631a19c2b34d260691402650f8 Mon Sep 17 00:00:00 2001 From: Ankita Garg Date: Mon, 12 May 2008 21:20:58 +0200 Subject: ftrace: fix conversion of task state to char in latency tracer The conversion of task states to a character in the sched_switch tracer (part of latency tracer infrastructure), seems to be incorrect. We currently do it by indexing into the state_to_char array using the state value. The state values do not map directly into the array index and are thus incorrect. The following patch addresses this issue. This is also what is being done even in the show_task() routine in kernel/sched.c The patch has been compile and run tested. Signed-off-by: Ankita Garg Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0eef0503feb..9197782d15c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1208,6 +1208,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) char *comm; int S, T; int i; + unsigned state; if (!next_entry) next_entry = entry; @@ -1238,11 +1239,11 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) break; case TRACE_CTX: case TRACE_WAKE: - S = entry->ctx.prev_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.prev_state] : 'X'; T = entry->ctx.next_state < sizeof(state_to_char) ? state_to_char[entry->ctx.next_state] : 'X'; + state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0; + S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; comm = trace_find_cmdline(entry->ctx.next_pid); trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n", entry->ctx.prev_pid, -- cgit v1.2.3 From 8487c23765b6e0444ec6b5f1530766d63fe68e35 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:58 +0200 Subject: ftrace: allow trace_pipe to block on all reads We expect things like "cat" to block on reads to trace_pipe. That's what trace_pipe is for. 
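For the user-space side, a minimal reader equivalent to running "cat" on trace_pipe; the debugfs mount point may differ on a given system. Opening the file with O_NONBLOCK instead would make read() fail with EAGAIN on an empty buffer, as added by the NONBLOCK patch later in this series.

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            char buf[4096];
            ssize_t n;
            int fd = open("/debug/tracing/trace_pipe", O_RDONLY);

            if (fd < 0) {
                    perror("open trace_pipe");
                    return 1;
            }

            /* read() blocks until trace data is available */
            while ((n = read(fd, buf, sizeof(buf))) > 0)
                    fwrite(buf, 1, n, stdout);

            close(fd);
            return 0;
    }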
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9197782d15c..fd4ecc29200 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2231,8 +2231,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, start = 0; while (trace_empty(iter)) { - if (!(trace_flags & TRACE_ITER_BLOCK)) - return -EWOULDBLOCK; /* * This is a make-shift waitqueue. The reason we don't use * an actual wait queue is because: -- cgit v1.2.3 From b5685aede3b7b65e72ddc73b951aa1f70798a614 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:58 +0200 Subject: ftrace: restore iterator trace in pipe read The trace iterator is reset in the read. We still need to restore the tracer that the trace_pipe was opened with. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fd4ecc29200..d141fc98f3a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2202,6 +2202,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, { struct trace_iterator *iter = filp->private_data; struct trace_array_cpu *data; + struct trace_array *tr = iter->tr; + struct tracer *tracer = iter->trace; static cpumask_t mask; static int start; unsigned long flags; @@ -2274,7 +2276,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, cnt = PAGE_SIZE - 1; memset(iter, 0, sizeof(*iter)); - iter->tr = &global_trace; + iter->tr = tr; + iter->trace = tracer; iter->pos = -1; /* -- cgit v1.2.3 From 845279972f1736c3463c9cebd1bad92a0a347176 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:58 +0200 Subject: ftrace: return EOF in trace_pipe on change of tracer Break out of while loop with EOF when the current_trace changes. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d141fc98f3a..2af940433e9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2253,6 +2253,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, if (signal_pending(current)) return -EINTR; + if (iter->trace != current_trace) + return 0; + /* * We block until we read something and tracing is disabled. * We still block if tracing is disabled, but we have never -- cgit v1.2.3 From 2dc8f09571a61d9cb3dc47bba6608689dfe15d16 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:58 +0200 Subject: ftrace: trace_pipe implement NONBLOCK This patch implements "NONBLOCK" for trace_pipe. If the trace_pipe is opened with O_NONBLOCK, then the trace_pipe read will not block when buffer is empty. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2af940433e9..3b4eaf36ed5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2233,6 +2233,10 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, start = 0; while (trace_empty(iter)) { + + if ((filp->f_flags & O_NONBLOCK)) + return -EAGAIN; + /* * This is a make-shift waitqueue. 
The reason we don't use * an actual wait queue is because: -- cgit v1.2.3 From 05bd68c514579e007b46e4fa0461b78416a3f4c2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:59 +0200 Subject: ftrace: user proper API for setting RT prios in selftest The wakeup selftest used an internal API for setting the test task priority. This patch fixes it to use the proper API for performing such a task. Thanks goes to Randy Dunlap for pointing out this build failure. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace_selftest.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 395274e783b..a5f6001c333 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -410,11 +410,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * #ifdef CONFIG_SCHED_TRACER static int trace_wakeup_test_thread(void *data) { - struct completion *x = data; - /* Make this a RT thread, doesn't need to be too high */ + struct sched_param param = { .sched_priority = 5 }; + struct completion *x = data; - rt_mutex_setprio(current, MAX_RT_PRIO - 5); + sched_setscheduler(current, SCHED_FIFO, ¶m); /* Make it know we have a new prio */ complete(x); -- cgit v1.2.3 From a98a3c3fde3ae7614f19758a043691b6f59dac53 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:59 +0200 Subject: ftrace: trace_entries to dynamically change trace buffer size This patch adds /debug/tracing/trace_entries that allows users to see as well as modify the number of trace entries the buffers hold. The number of entries only increments in ENTRIES_PER_PAGE which is calculated by the size of an entry with the number of entries that can fit in a page. The user does not need to use an exact size, but the entries will be rounded to one of the increments. Trying to set the entries to 0 will return with -EINVAL. To avoid race conditions, the modification of the buffer size can only be done when tracing is completely disabled (current_tracer == none). A info message will be printed if a user tries to modify the buffer size when not set to none. 
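A sketch of driving this from user space, assuming the debugfs files named in the message above (current_tracer and trace_entries under /debug/tracing; the mount point may differ). The tracer has to be set to "none" first, and the kernel rounds the requested count to whole pages of entries.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static int write_str(const char *path, const char *val)
    {
            int fd = open(path, O_WRONLY);

            if (fd < 0)
                    return -1;
            if (write(fd, val, strlen(val)) < 0) {
                    close(fd);
                    return -1;
            }
            return close(fd);
    }

    int main(void)
    {
            char buf[64];
            ssize_t n;
            int fd;

            if (write_str("/debug/tracing/current_tracer", "none"))
                    perror("current_tracer");
            if (write_str("/debug/tracing/trace_entries", "65536"))
                    perror("trace_entries");    /* fails if a tracer is active */

            fd = open("/debug/tracing/trace_entries", O_RDONLY);
            if (fd < 0)
                    return 1;
            n = read(fd, buf, sizeof(buf) - 1);
            if (n > 0) {
                    buf[n] = '\0';
                    printf("trace_entries is now %s", buf);
            }
            close(fd);
            return 0;
    }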
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 137 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3b4eaf36ed5..4723e012151 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -35,6 +35,15 @@ unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; +/* dummy trace to disable tracing */ +static struct tracer no_tracer __read_mostly = +{ + .name = "none", +}; + +static int trace_alloc_page(void); +static int trace_free_page(void); + static int tracing_disabled = 1; long @@ -2364,6 +2373,70 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, return read; } +static ssize_t +tracing_entries_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r; + + r = sprintf(buf, "%lu\n", tr->entries); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_entries_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + char buf[64]; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + val = simple_strtoul(buf, NULL, 10); + + /* must have at least 1 entry */ + if (!val) + return -EINVAL; + + mutex_lock(&trace_types_lock); + + if (current_trace != &no_tracer) { + cnt = -EBUSY; + pr_info("ftrace: set current_tracer to none" + " before modifying buffer size\n"); + goto out; + } + + if (val > global_trace.entries) { + while (global_trace.entries < val) { + if (trace_alloc_page()) { + cnt = -ENOMEM; + goto out; + } + } + } else { + /* include the number of entries in val (inc of page entries) */ + while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1)) + trace_free_page(); + } + + filp->f_pos += cnt; + + out: + max_tr.entries = global_trace.entries; + mutex_unlock(&trace_types_lock); + + return cnt; +} + static struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -2389,6 +2462,12 @@ static struct file_operations tracing_pipe_fops = { .release = tracing_release_pipe, }; +static struct file_operations tracing_entries_fops = { + .open = tracing_open_generic, + .read = tracing_entries_read, + .write = tracing_entries_write, +}; + #ifdef CONFIG_DYNAMIC_FTRACE static ssize_t @@ -2500,6 +2579,12 @@ static __init void tracer_init_debugfs(void) pr_warning("Could not create debugfs " "'tracing_threash' entry\n"); + entry = debugfs_create_file("trace_entries", 0644, d_tracer, + &global_trace, &tracing_entries_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'tracing_threash' entry\n"); + #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, @@ -2510,12 +2595,6 @@ static __init void tracer_init_debugfs(void) #endif } -/* dummy trace to disable tracing */ -static struct tracer no_tracer __read_mostly = -{ - .name = "none", -}; - static int trace_alloc_page(void) { struct trace_array_cpu *data; @@ -2552,7 +2631,6 @@ static int trace_alloc_page(void) /* Now that we successfully allocate a page per CPU, add them */ for_each_possible_cpu(i) { data = global_trace.data[i]; - data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; page = list_entry(pages.next, struct page, lru); 
list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); @@ -2560,7 +2638,6 @@ static int trace_alloc_page(void) #ifdef CONFIG_TRACER_MAX_TRACE data = max_tr.data[i]; - data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; page = list_entry(pages.next, struct page, lru); list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); @@ -2579,6 +2656,55 @@ static int trace_alloc_page(void) return -ENOMEM; } +static int trace_free_page(void) +{ + struct trace_array_cpu *data; + struct page *page; + struct list_head *p; + int i; + int ret = 0; + + /* free one page from each buffer */ + for_each_possible_cpu(i) { + data = global_trace.data[i]; + p = data->trace_pages.next; + if (p == &data->trace_pages) { + /* should never happen */ + WARN_ON(1); + tracing_disabled = 1; + ret = -1; + break; + } + page = list_entry(p, struct page, lru); + ClearPageLRU(page); + list_del(&page->lru); + __free_page(page); + + tracing_reset(data); + +#ifdef CONFIG_TRACER_MAX_TRACE + data = max_tr.data[i]; + p = data->trace_pages.next; + if (p == &data->trace_pages) { + /* should never happen */ + WARN_ON(1); + tracing_disabled = 1; + ret = -1; + break; + } + page = list_entry(p, struct page, lru); + ClearPageLRU(page); + list_del(&page->lru); + __free_page(page); + + tracing_reset(data); +#endif + } + global_trace.entries -= ENTRIES_PER_PAGE; + + return ret; +} + __init static int tracer_alloc_buffers(void) { struct trace_array_cpu *data; @@ -2609,6 +2735,9 @@ __init static int tracer_alloc_buffers(void) /* use the LRU flag to differentiate the two buffers */ ClearPageLRU(page); + data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + /* Only allocate if we are actually using the max trace */ #ifdef CONFIG_TRACER_MAX_TRACE array = (void *)__get_free_page(GFP_KERNEL); -- cgit v1.2.3 From bb065afb8ebd07a03155502dba29ebf0f6fe67e8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:00 +0200 Subject: lockdep: update lockdep_recursion on graph_lock With the introduction of ftrace, it is possible to recurse into the lockdep functions via the mcount call. To prevent possible lockups, updating the lockdep_recursion counter on grabbing the internal lockdep_lock should prevent deadlocks. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/lockdep.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 90a440cbd6d..65548eff029 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -82,6 +82,8 @@ static int graph_lock(void) __raw_spin_unlock(&lockdep_lock); return 0; } + /* prevent any recursions within lockdep from causing deadlocks */ + current->lockdep_recursion++; return 1; } @@ -90,6 +92,7 @@ static inline int graph_unlock(void) if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) return DEBUG_LOCKS_WARN_ON(1); + current->lockdep_recursion--; __raw_spin_unlock(&lockdep_lock); return 0; } -- cgit v1.2.3 From 93dcc6ea096c51cc30ad0f6647ed05fead2439bf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 May 2008 21:21:00 +0200 Subject: ftrace: simplify hexprint simplify hex to ascii conversion. 
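The same table-driven nibble-to-ASCII conversion, pulled out into a standalone illustration: one table lookup per nibble instead of an if/else on every digit, with the low nibble emitted first to match the trace output format.

    #include <stdio.h>

    static const char hex2asc[] = "0123456789abcdef";

    static void put_hex_byte(char *out, unsigned char byte)
    {
            /* low nibble first, as in the trace hex output */
            out[0] = hex2asc[byte & 0x0f];
            out[1] = hex2asc[byte >> 4];
    }

    int main(void)
    {
            char s[3] = { 0, 0, 0 };

            put_hex_byte(s, 0x3c);
            printf("0x3c -> \"%s\"\n", s);  /* prints "c3" */
            return 0;
    }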
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4723e012151..627e39936ea 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -248,19 +248,18 @@ trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) } #define HEX_CHARS 17 +static const char hex2asc[] = "0123456789abcdef"; static int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) { unsigned char hex[HEX_CHARS]; - unsigned char *data; + unsigned char *data = mem; unsigned char byte; int i, j; BUG_ON(len >= HEX_CHARS); - data = mem; - #ifdef __BIG_ENDIAN for (i = 0, j = 0; i < len; i++) { #else @@ -268,22 +267,10 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) #endif byte = data[i]; - hex[j] = byte & 0x0f; - if (hex[j] >= 10) - hex[j] += 'a' - 10; - else - hex[j] += '0'; - j++; - - hex[j] = byte >> 4; - if (hex[j] >= 10) - hex[j] += 'a' - 10; - else - hex[j] += '0'; - j++; + hex[j++] = hex2asc[byte & 0x0f]; + hex[j++] = hex2asc[byte >> 4]; } - hex[j] = ' '; - j++; + hex[j++] = ' '; return trace_seq_putmem(s, hex, j); } -- cgit v1.2.3 From afc2abc0ae4265730a0fc48618012193a09a1d10 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:00 +0200 Subject: ftrace: cleanups no code changed. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 627e39936ea..d6b60576f99 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1155,12 +1155,12 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; - if (hardirq && softirq) + if (hardirq && softirq) { trace_seq_putc(s, 'H'); - else { - if (hardirq) + } else { + if (hardirq) { trace_seq_putc(s, 'h'); - else { + } else { if (softirq) trace_seq_putc(s, 's'); else @@ -2177,8 +2177,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) * Always select as readable when in blocking mode */ return POLLIN | POLLRDNORM; - } - else { + } else { if (!trace_empty(iter)) return POLLIN | POLLRDNORM; poll_wait(filp, &trace_wait, poll_table); -- cgit v1.2.3 From cffae437cdfbc8a5370d38cefbff1dfe34dad6ca Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:00 +0200 Subject: ftrace: simple clean ups Andrew Morton mentioned some clean ups that should be done to ftrace. This patch does some of the simple clean ups. 
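One of the patterns being cleaned up in the diff below, shown as a standalone user-space stand-in for the copy_from_user() write handlers (demo-only names): the limit is derived from sizeof(buf) instead of a hard-coded 63, oversized input is rejected up front, and the buffer is always NUL-terminated.

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    static int parse_user_input(const char *ubuf, size_t cnt)
    {
            char buf[64];

            if (cnt >= sizeof(buf))
                    return -EINVAL;

            memcpy(buf, ubuf, cnt);
            buf[cnt] = '\0';

            printf("parsed: %s\n", buf);
            return 0;
    }

    int main(void)
    {
            return parse_user_input("100\n", 4) ? 1 : 0;
    }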
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d6b60576f99..0a367362ba2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -36,8 +36,7 @@ unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; /* dummy trace to disable tracing */ -static struct tracer no_tracer __read_mostly = -{ +static struct tracer no_tracer __read_mostly = { .name = "none", }; @@ -1906,8 +1905,8 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, int neg = 0; int i; - if (cnt > 63) - cnt = 63; + if (cnt >= sizeof(buf)) + return -EINVAL; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; @@ -1999,8 +1998,8 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, long val; char buf[64]; - if (cnt > 63) - cnt = 63; + if (cnt >= sizeof(buf)) + return -EINVAL; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; @@ -2099,10 +2098,10 @@ tracing_max_lat_read(struct file *filp, char __user *ubuf, char buf[64]; int r; - r = snprintf(buf, 64, "%ld\n", + r = snprintf(buf, sizeof(buf), "%ld\n", *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr)); - if (r > 64) - r = 64; + if (r > sizeof(buf)) + r = sizeof(buf); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -2114,8 +2113,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, long val; char buf[64]; - if (cnt > 63) - cnt = 63; + if (cnt >= sizeof(buf)) + return -EINVAL; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; @@ -2378,8 +2377,8 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, unsigned long val; char buf[64]; - if (cnt > 63) - cnt = 63; + if (cnt >= sizeof(buf)) + return -EINVAL; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; -- cgit v1.2.3 From c6caeeb142cd3a03c46107aac45c8effc02bbadb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:00 +0200 Subject: ftrace: replace simple_strtoul with strict_strtoul Andrew Morton suggested using strict_strtoul over simple_strtoul. This patch replaces them in ftrace. 
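A user-space analogue of what strict_strtoul() buys over simple_strtoul(): the conversion either succeeds completely or reports an error, instead of silently stopping at the first non-digit. This is a sketch only; strict_strtoul() itself is a kernel helper, used as shown in the diff below.

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Fail on empty input, overflow, or trailing junk. */
    static int strict_parse_ulong(const char *s, unsigned long *val)
    {
            char *end;

            errno = 0;
            *val = strtoul(s, &end, 10);
            if (errno || end == s || (*end != '\0' && *end != '\n'))
                    return -EINVAL;
            return 0;
    }

    int main(void)
    {
            unsigned long v;

            printf("\"100\" -> %d\n", strict_parse_ulong("100", &v));
            printf("\"10x\" -> %d\n", strict_parse_ulong("10x", &v));
            return 0;
    }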
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0a367362ba2..290e9da7aa9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -92,9 +92,16 @@ void trace_wake_up(void) static int __init set_nr_entries(char *str) { + unsigned long nr_entries; + int ret; + if (!str) return 0; - trace_nr_entries = simple_strtoul(str, &str, 0); + ret = strict_strtoul(str, 0, &nr_entries); + /* nr_entries can not be zero */ + if (ret < 0 || nr_entries == 0) + return 0; + trace_nr_entries = nr_entries; return 1; } __setup("trace_entries=", set_nr_entries); @@ -1995,8 +2002,9 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - long val; char buf[64]; + long val; + int ret; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2006,7 +2014,9 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; - val = simple_strtoul(buf, NULL, 10); + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; val = !!val; @@ -2110,8 +2120,9 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { long *ptr = filp->private_data; - long val; char buf[64]; + long val; + int ret; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2121,7 +2132,9 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; - val = simple_strtoul(buf, NULL, 10); + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; *ptr = val * 1000; @@ -2376,6 +2389,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, { unsigned long val; char buf[64]; + int ret; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2385,7 +2399,9 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; - val = simple_strtoul(buf, NULL, 10); + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; /* must have at least 1 entry */ if (!val) -- cgit v1.2.3 From ab46428c6969d50ecf6f6e97b7a84abba6274368 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:00 +0200 Subject: ftrace: modulize the number of CPU buffers Currently ftrace allocates a trace buffer for every possible CPU. Work is being done to change it to only online CPUs and add hooks to hotplug CPUS. This patch lays out the infrastructure for such a change. 
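The infrastructure below funnels every per-CPU buffer loop through a single iterator over a dedicated cpumask, so the traced set can later be narrowed from all possible CPUs to online ones. As a purely illustrative userspace analogue of that shape (slot count, mask value and names are all made up):

#include <stdio.h>

#define MAX_SLOTS 8
static unsigned long active_mask;       /* plays the role of tracing_buffer_mask */

/* Visit only slots that actually have a buffer, like for_each_tracing_cpu(). */
#define for_each_active_slot(i) \
        for ((i) = 0; (i) < MAX_SLOTS; (i)++) \
                if (active_mask & (1UL << (i)))

int main(void)
{
        int i;

        active_mask = 0x05;     /* pretend only slots 0 and 2 were allocated */
        for_each_active_slot(i)
                printf("reset buffer %d\n", i);
        return 0;
}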
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 290e9da7aa9..5da391c5fb0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -35,6 +35,12 @@ unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; +static unsigned long __read_mostly tracing_nr_buffers; +static cpumask_t __read_mostly tracing_buffer_mask; + +#define for_each_tracing_cpu(cpu) \ + for_each_cpu_mask(cpu, tracing_buffer_mask) + /* dummy trace to disable tracing */ static struct tracer no_tracer __read_mostly = { .name = "none", @@ -328,7 +334,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) WARN_ON_ONCE(!irqs_disabled()); __raw_spin_lock(&ftrace_max_lock); /* clear out all the previous traces */ - for_each_possible_cpu(i) { + for_each_tracing_cpu(i) { data = tr->data[i]; flip_trace(max_tr.data[i], data); tracing_reset(data); @@ -352,7 +358,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) WARN_ON_ONCE(!irqs_disabled()); __raw_spin_lock(&ftrace_max_lock); - for_each_possible_cpu(i) + for_each_tracing_cpu(i) tracing_reset(max_tr.data[i]); flip_trace(max_tr.data[cpu], data); @@ -398,7 +404,7 @@ int register_tracer(struct tracer *type) * internal tracing to verify that everything is in order. * If we fail, we do not register this tracer. */ - for_each_possible_cpu(i) { + for_each_tracing_cpu(i) { data = tr->data[i]; if (!head_page(data)) continue; @@ -417,7 +423,7 @@ int register_tracer(struct tracer *type) goto out; } /* Only reset on passing, to avoid touching corrupted buffers */ - for_each_possible_cpu(i) { + for_each_tracing_cpu(i) { data = tr->data[i]; if (!head_page(data)) continue; @@ -847,7 +853,7 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu) int next_cpu = -1; int cpu; - for_each_possible_cpu(cpu) { + for_each_tracing_cpu(cpu) { if (!head_page(tr->data[cpu])) continue; ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); @@ -972,7 +978,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) iter->prev_ent = NULL; iter->prev_cpu = -1; - for_each_possible_cpu(i) { + for_each_tracing_cpu(i) { iter->next_idx[i] = 0; iter->next_page[i] = NULL; } @@ -1089,7 +1095,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) if (type) name = type->name; - for_each_possible_cpu(cpu) { + for_each_tracing_cpu(cpu) { if (head_page(tr->data[cpu])) { total += tr->data[cpu]->trace_idx; if (tr->data[cpu]->trace_idx > tr->entries) @@ -1519,7 +1525,7 @@ static int trace_empty(struct trace_iterator *iter) struct trace_array_cpu *data; int cpu; - for_each_possible_cpu(cpu) { + for_each_tracing_cpu(cpu) { data = iter->tr->data[cpu]; if (head_page(data) && data->trace_idx && @@ -1831,7 +1837,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, raw_local_irq_disable(); __raw_spin_lock(&ftrace_max_lock); - for_each_possible_cpu(cpu) { + for_each_tracing_cpu(cpu) { /* * Increase/decrease the disabled counter if we are * about to flip a bit in the cpumask: @@ -2308,7 +2314,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, ftrace_enabled = 0; #endif smp_wmb(); - for_each_possible_cpu(cpu) { + for_each_tracing_cpu(cpu) { data = iter->tr->data[cpu]; if (!head_page(data) || !data->trace_idx) @@ -2605,7 +2611,7 @@ static int 
trace_alloc_page(void) int i; /* first allocate a page for each CPU */ - for_each_possible_cpu(i) { + for_each_tracing_cpu(i) { array = (void *)__get_free_page(GFP_KERNEL); if (array == NULL) { printk(KERN_ERR "tracer: failed to allocate page" @@ -2630,7 +2636,7 @@ static int trace_alloc_page(void) } /* Now that we successfully allocate a page per CPU, add them */ - for_each_possible_cpu(i) { + for_each_tracing_cpu(i) { data = global_trace.data[i]; page = list_entry(pages.next, struct page, lru); list_del_init(&page->lru); @@ -2666,7 +2672,7 @@ static int trace_free_page(void) int ret = 0; /* free one page from each buffer */ - for_each_possible_cpu(i) { + for_each_tracing_cpu(i) { data = global_trace.data[i]; p = data->trace_pages.next; if (p == &data->trace_pages) { @@ -2717,8 +2723,12 @@ __init static int tracer_alloc_buffers(void) global_trace.ctrl = tracer_enabled; + /* TODO: make the number of buffers hot pluggable with CPUS */ + tracing_nr_buffers = num_possible_cpus(); + tracing_buffer_mask = cpu_possible_map; + /* Allocate the first page for all buffers */ - for_each_possible_cpu(i) { + for_each_tracing_cpu(i) { data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); max_tr.data[i] = &per_cpu(max_data, i); -- cgit v1.2.3 From 4fcdae83cebda24b519a89d3dd976081fff1ca80 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:00 +0200 Subject: ftrace: comment code This is first installment of adding documentation to the ftrace. Expect many more patches of this kind in the near future. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++++++- kernel/trace/trace.h | 7 +++ 2 files changed, 141 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5da391c5fb0..a102b11eacf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -64,26 +64,79 @@ cycle_t ftrace_now(int cpu) return cpu_clock(cpu); } +/* + * The global_trace is the descriptor that holds the tracing + * buffers for the live tracing. For each CPU, it contains + * a link list of pages that will store trace entries. The + * page descriptor of the pages in the memory is used to hold + * the link list by linking the lru item in the page descriptor + * to each of the pages in the buffer per CPU. + * + * For each active CPU there is a data field that holds the + * pages for the buffer for that CPU. Each CPU has the same number + * of pages allocated for its buffer. + */ static struct trace_array global_trace; static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +/* + * The max_tr is used to snapshot the global_trace when a maximum + * latency is reached. Some tracers will use this to store a maximum + * trace while it continues examining live traces. + * + * The buffers for the max_tr are set up the same as the global_trace. + * When a snapshot is taken, the link list of the max_tr is swapped + * with the link list of the global_trace and the buffers are reset for + * the global_trace so the tracing can continue. + */ static struct trace_array max_tr; static DEFINE_PER_CPU(struct trace_array_cpu, max_data); +/* tracer_enabled is used to toggle activation of a tracer */ static int tracer_enabled = 1; + +/* + * trace_nr_entries is the number of entries that is allocated + * for a buffer. Note, the number of entries is always rounded + * to ENTRIES_PER_PAGE. 
+ */ static unsigned long trace_nr_entries = 65536UL; +/* trace_types holds a link list of available tracers. */ static struct tracer *trace_types __read_mostly; + +/* current_trace points to the tracer that is currently active */ static struct tracer *current_trace __read_mostly; + +/* + * max_tracer_type_len is used to simplify the allocating of + * buffers to read userspace tracer names. We keep track of + * the longest tracer name registered. + */ static int max_tracer_type_len; +/* + * trace_types_lock is used to protect the trace_types list. + * This lock is also used to keep user access serialized. + * Accesses from userspace will grab this lock while userspace + * activities happen inside the kernel. + */ static DEFINE_MUTEX(trace_types_lock); + +/* trace_wait is a waitqueue for tasks blocked on trace_poll */ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); +/* trace_flags holds iter_ctrl options */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; +/** + * trace_wake_up - wake up tasks waiting for trace input + * + * Simply wakes up any task that is blocked on the trace_wait + * queue. These is used with trace_poll for tasks polling the trace. + */ void trace_wake_up(void) { /* @@ -117,6 +170,14 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) return nsecs / 1000; } +/* + * trace_flag_type is an enumeration that holds different + * states when a trace occurs. These are: + * IRQS_OFF - interrupts were disabled + * NEED_RESCED - reschedule is requested + * HARDIRQ - inside an interrupt handler + * SOFTIRQ - inside a softirq handler + */ enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, TRACE_FLAG_NEED_RESCHED = 0x02, @@ -124,10 +185,14 @@ enum trace_flag_type { TRACE_FLAG_SOFTIRQ = 0x08, }; +/* + * TRACE_ITER_SYM_MASK masks the options in trace_flags that + * control the output of kernel symbols. + */ #define TRACE_ITER_SYM_MASK \ (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) -/* These must match the bit postions above */ +/* These must match the bit postions in trace_iterator_flags */ static const char *trace_options[] = { "print-parent", "sym-offset", @@ -142,6 +207,15 @@ static const char *trace_options[] = { NULL }; +/* + * ftrace_max_lock is used to protect the swapping of buffers + * when taking a max snapshot. The buffers themselves are + * protected by per_cpu spinlocks. But the action of the swap + * needs its own lock. + * + * This is defined as a raw_spinlock_t in order to help + * with performance when lockdep debugging is enabled. + */ static raw_spinlock_t ftrace_max_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; @@ -172,6 +246,13 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_record_cmdline(current); } +/** + * check_pages - integrity check of trace buffers + * + * As a safty measure we check to make sure the data pages have not + * been corrupted. TODO: configure to disable this because it adds + * a bit of overhead. + */ void check_pages(struct trace_array_cpu *data) { struct page *page, *tmp; @@ -185,6 +266,13 @@ void check_pages(struct trace_array_cpu *data) } } +/** + * head_page - page address of the first page in per_cpu buffer. + * + * head_page returns the page address of the first page in + * a per_cpu buffer. This also preforms various consistency + * checks to make sure the buffer has not been corrupted. 
+ */ void *head_page(struct trace_array_cpu *data) { struct page *page; @@ -199,6 +287,17 @@ void *head_page(struct trace_array_cpu *data) return page_address(page); } +/** + * trace_seq_printf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + */ int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) { @@ -222,6 +321,16 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) return len; } +/** + * trace_seq_puts - trace sequence printing of simple string + * @s: trace sequence descriptor + * @str: simple string to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple string + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. + */ static int trace_seq_puts(struct trace_seq *s, const char *str) { @@ -304,6 +413,13 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s) trace_seq_reset(s); } +/* + * flip the trace buffers between two trace descriptors. + * This usually is the buffers between the global_trace and + * the max_tr to record a snapshot of a current trace. + * + * The ftrace_max_lock must be held. + */ static void flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) { @@ -325,6 +441,15 @@ flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) check_pages(tr2); } +/** + * update_max_tr - snapshot all trace buffers from global_trace to max_tr + * @tr: tracer + * @tsk: the task with the latency + * @cpu: The cpu that initiated the trace. + * + * Flip the buffers between the @tr and the max_tr and record information + * about which task was the cause of this latency. + */ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { @@ -349,6 +474,8 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) * @tr - tracer * @tsk - task with the latency * @cpu - the cpu of the buffer to copy. + * + * Flip the trace of a single CPU buffer between the @tr and the max_tr. */ void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) @@ -368,6 +495,12 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) __raw_spin_unlock(&ftrace_max_lock); } +/** + * register_tracer - register a tracer with the ftrace system. + * @type - the plugin for the tracer + * + * Register a new plugin tracer. + */ int register_tracer(struct tracer *type) { struct tracer *t; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b0ca7473671..21c29ee13e5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -294,6 +294,13 @@ extern long ns2usecs(cycle_t nsec); extern unsigned long trace_flags; +/* + * trace_iterator_flags is an enumeration that defines bit + * positions into trace_flags that controls the output. + * + * NOTE: These bits must match the trace_options array in + * trace.c. 
+ */ enum trace_iterator_flags { TRACE_ITER_PRINT_PARENT = 0x01, TRACE_ITER_SYM_OFFSET = 0x02, -- cgit v1.2.3 From 25b0b44a1c732ccfc58095cdd8438955a0a19fff Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:00 +0200 Subject: ftrace: fix comm on function trace output In cleaning up of the sched_switch code, the function trace recording of task comms was removed. This patch adds back the recording of comms for function trace. The output of ftrace now has the task comm instead of <...>. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 7 ++++++- kernel/trace/trace.h | 2 ++ kernel/trace/trace_functions.c | 2 ++ kernel/trace/trace_sched_switch.c | 7 +++++-- 4 files changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a102b11eacf..1281969103b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -620,7 +620,12 @@ static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; static int cmdline_idx; static DEFINE_SPINLOCK(trace_cmdline_lock); -atomic_t trace_record_cmdline_disabled; + +/* trace in all context switches */ +atomic_t trace_record_cmdline_enabled __read_mostly; + +/* temporary disable recording */ +atomic_t trace_record_cmdline_disabled __read_mostly; static void trace_init_cmdlines(void) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 21c29ee13e5..8991c5efcc7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -216,6 +216,8 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_max_latency; extern unsigned long tracing_thresh; +extern atomic_t trace_record_cmdline_enabled; + void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 4165d34bd28..0a084656d7c 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -29,12 +29,14 @@ static void function_reset(struct trace_array *tr) static void start_function_trace(struct trace_array *tr) { function_reset(tr); + atomic_inc(&trace_record_cmdline_enabled); tracing_start_function_trace(); } static void stop_function_trace(struct trace_array *tr) { tracing_stop_function_trace(); + atomic_dec(&trace_record_cmdline_enabled); } static void function_trace_init(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 5671db0e182..a3376478fc2 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -29,8 +29,6 @@ ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) if (!tracer_enabled) return; - tracing_record_cmdline(prev); - local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -73,6 +71,9 @@ void ftrace_ctx_switch(void *__rq, struct task_struct *prev, struct task_struct *next) { + if (unlikely(atomic_read(&trace_record_cmdline_enabled))) + tracing_record_cmdline(prev); + /* * If tracer_switch_func only points to the local * switch func, it still needs the ptr passed to it. 
@@ -134,11 +135,13 @@ static void sched_switch_reset(struct trace_array *tr) static void start_sched_trace(struct trace_array *tr) { sched_switch_reset(tr); + atomic_inc(&trace_record_cmdline_enabled); tracer_enabled = 1; } static void stop_sched_trace(struct trace_array *tr) { + atomic_dec(&trace_record_cmdline_enabled); tracer_enabled = 0; } -- cgit v1.2.3 From 53d0aa773053ab18287781e25d52c5faf9e0e09e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:01 +0200 Subject: ftrace: add logic to record overruns This patch sets up the infrastructure to record overruns of the tracing buffer. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 16 +++++++++++----- kernel/trace/trace.h | 6 +++++- 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1281969103b..b9126ef46a9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -609,6 +609,7 @@ void unregister_tracer(struct tracer *type) void tracing_reset(struct trace_array_cpu *data) { data->trace_idx = 0; + data->overrun = 0; data->trace_head = data->trace_tail = head_page(data); data->trace_head_idx = 0; data->trace_tail_idx = 0; @@ -750,6 +751,7 @@ tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) if (data->trace_head == data->trace_tail && idx_next == data->trace_tail_idx) { /* overrun */ + data->overrun++; data->trace_tail_idx++; if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { data->trace_tail = @@ -2353,8 +2355,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, { struct trace_iterator *iter = filp->private_data; struct trace_array_cpu *data; - struct trace_array *tr = iter->tr; - struct tracer *tracer = iter->trace; static cpumask_t mask; static int start; unsigned long flags; @@ -2433,10 +2433,11 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, if (cnt >= PAGE_SIZE) cnt = PAGE_SIZE - 1; - memset(iter, 0, sizeof(*iter)); - iter->tr = tr; - iter->trace = tracer; + /* reset all but tr, trace, and overruns */ iter->pos = -1; + memset(&iter->seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); /* * We need to stop all tracing on all CPUS to read the @@ -2465,6 +2466,11 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, for_each_cpu_mask(cpu, mask) { data = iter->tr->data[cpu]; __raw_spin_lock(&data->lock); + + if (data->overrun > iter->last_overrun[cpu]) + iter->overrun[cpu] += + data->overrun - iter->last_overrun[cpu]; + iter->last_overrun[cpu] = data->overrun; } while (find_next_entry_inc(iter) != NULL) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8991c5efcc7..c1ec134ac35 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -97,6 +97,7 @@ struct trace_array_cpu { void *trace_head; /* producer */ void *trace_tail; /* consumer */ unsigned long trace_idx; + unsigned long overrun; unsigned long saved_latency; unsigned long critical_start; unsigned long critical_end; @@ -157,10 +158,13 @@ struct trace_seq { * results to users and which routines might sleep, etc: */ struct trace_iterator { - struct trace_seq seq; struct trace_array *tr; struct tracer *trace; + long last_overrun[NR_CPUS]; + long overrun[NR_CPUS]; + /* The below is zeroed out in pipe_read */ + struct trace_seq seq; struct trace_entry *ent; int cpu; -- cgit v1.2.3 From 107bad8bef5ab2c3a3bff7648c18c9dc3abdc13b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:01 +0200 
Subject: ftrace: add trace pipe header pluggin This patch adds a method for open_pipe and open_read to the pluggins so that they can add a header to the trace pipe call. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 38 +++++++++++++++++++++++++++++++------- kernel/trace/trace.h | 5 +++++ 2 files changed, 36 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b9126ef46a9..32f9106d612 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2307,11 +2307,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (!iter) return -ENOMEM; + mutex_lock(&trace_types_lock); iter->tr = &global_trace; iter->trace = current_trace; - filp->private_data = iter; + if (iter->trace->pipe_open) + iter->trace->pipe_open(iter); + mutex_unlock(&trace_types_lock); + return 0; } @@ -2380,13 +2384,24 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, return cnt; } + mutex_lock(&trace_types_lock); + if (iter->trace->read) { + ret = iter->trace->read(iter, filp, ubuf, cnt, ppos); + if (ret) { + read = ret; + goto out; + } + } + trace_seq_reset(&iter->seq); start = 0; while (trace_empty(iter)) { - if ((filp->f_flags & O_NONBLOCK)) - return -EAGAIN; + if ((filp->f_flags & O_NONBLOCK)) { + read = -EAGAIN; + goto out; + } /* * This is a make-shift waitqueue. The reason we don't use @@ -2400,16 +2415,22 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, set_current_state(TASK_INTERRUPTIBLE); iter->tr->waiter = current; + mutex_unlock(&trace_types_lock); + /* sleep for one second, and try again. */ schedule_timeout(HZ); + mutex_lock(&trace_types_lock); + iter->tr->waiter = NULL; - if (signal_pending(current)) - return -EINTR; + if (signal_pending(current)) { + read = -EINTR; + goto out; + } if (iter->trace != current_trace) - return 0; + goto out; /* * We block until we read something and tracing is disabled. 
@@ -2428,7 +2449,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, /* stop when tracing is finished */ if (trace_empty(iter)) - return 0; + goto out; if (cnt >= PAGE_SIZE) cnt = PAGE_SIZE - 1; @@ -2518,6 +2539,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, if (ret) read = -EFAULT; +out: + mutex_unlock(&trace_types_lock); + return read; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c1ec134ac35..ee53d706066 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -135,9 +135,13 @@ struct tracer { void (*init)(struct trace_array *tr); void (*reset)(struct trace_array *tr); void (*open)(struct trace_iterator *iter); + void (*pipe_open)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); void (*start)(struct trace_iterator *iter); void (*stop)(struct trace_iterator *iter); + ssize_t (*read)(struct trace_iterator *iter, + struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos); void (*ctrl_update)(struct trace_array *tr); #ifdef CONFIG_FTRACE_STARTUP_TEST int (*selftest)(struct tracer *trace, @@ -160,6 +164,7 @@ struct trace_seq { struct trace_iterator { struct trace_array *tr; struct tracer *trace; + void *private; long last_overrun[NR_CPUS]; long overrun[NR_CPUS]; -- cgit v1.2.3 From 4823ed7eadf35e4b57ce581327e21d39585f1f32 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:01 +0200 Subject: ftrace: fix setting of pos in read_pipe In resetting the iterator in read_pipe, the reset of pos was postitioned in the wrong location with respect to the memset operation. The current code sets pos, incorrectly, to zero. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 32f9106d612..49e16630628 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2455,10 +2455,10 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, cnt = PAGE_SIZE - 1; /* reset all but tr, trace, and overruns */ - iter->pos = -1; memset(&iter->seq, 0, sizeof(struct trace_iterator) - offsetof(struct trace_iterator, seq)); + iter->pos = -1; /* * We need to stop all tracing on all CPUS to read the -- cgit v1.2.3 From 9fe068e92f6290e89e19adc521441661a1229f00 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:02 +0200 Subject: ftrace: trace faster Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 49e16630628..ca0d6ff74c1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2417,8 +2417,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, mutex_unlock(&trace_types_lock); - /* sleep for one second, and try again. */ - schedule_timeout(HZ); + /* sleep for 100 msecs, and try again. 
*/ + schedule_timeout(HZ/10); mutex_lock(&trace_types_lock); -- cgit v1.2.3 From a4feb8348b62fe76a63cdb5569f5c920f5283c06 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:02 +0200 Subject: ftrace: special stacktrace Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ca0d6ff74c1..c232d8248a0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -808,29 +808,6 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags); } -void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - struct trace_array_cpu *data = __data; - struct trace_array *tr = __tr; - struct trace_entry *entry; - unsigned long irq_flags; - - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, 0); - entry->type = TRACE_SPECIAL; - entry->special.arg1 = arg1; - entry->special.arg2 = arg2; - entry->special.arg3 = arg3; - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); - - trace_wake_up(); -} - void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, @@ -856,6 +833,30 @@ void __trace_stack(struct trace_array *tr, save_stack_trace(&trace); } +void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + struct trace_array_cpu *data = __data; + struct trace_array *tr = __tr; + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_SPECIAL; + entry->special.arg1 = arg1; + entry->special.arg2 = arg2; + entry->special.arg3 = arg3; + __trace_stack(tr, data, irq_flags, 4); + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); + + trace_wake_up(); +} + void tracing_sched_switch_trace(struct trace_array *tr, struct trace_array_cpu *data, -- cgit v1.2.3 From 2bb6f8d6389cbfadd657e7dc069f6986abf35e4f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:02 +0200 Subject: ftrace: use raw_smp_processor_id for mcount functions Due to debug hooks in the kernel that can change the way smp_processor_id works, use raw_smp_processor_id in mcount called functions (namely ftrace_record_ip). Currently we annotate most debug functions from calling mcount, but we should not rely on that to prevent kernel lockups. This patch uses the raw_smp_processor_id to prevent a recusive crash that can happen if a debug hook in smp_processor_id calls mcount. 
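The hunk below keeps the existing idea, a per-CPU counter that makes the mcount hook bail out if it re-enters itself, but reads the CPU number with raw_smp_processor_id() so the lookup itself cannot bounce back into mcount through a debug hook. A condensed sketch of that guard, with invented function and variable names:

#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(int, example_recursion);

static void notrace example_record_ip(unsigned long ip)
{
        int cpu;

        preempt_disable_notrace();
        cpu = raw_smp_processor_id();   /* no debug hooks, so no recursion */
        per_cpu(example_recursion, cpu)++;
        if (per_cpu(example_recursion, cpu) != 1)
                goto out;       /* already inside the hook on this CPU */

        /* ... record ip ... */
out:
        per_cpu(example_recursion, cpu)--;
        preempt_enable_notrace();
}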
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 40f64f7cd85..af5ad8949ab 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -267,6 +267,7 @@ ftrace_record_ip(unsigned long ip) unsigned long key; int resched; int atomic; + int cpu; if (!ftrace_enabled || ftrace_disabled) return; @@ -274,9 +275,15 @@ ftrace_record_ip(unsigned long ip) resched = need_resched(); preempt_disable_notrace(); - /* We simply need to protect against recursion */ - __get_cpu_var(ftrace_shutdown_disable_cpu)++; - if (__get_cpu_var(ftrace_shutdown_disable_cpu) != 1) + /* + * We simply need to protect against recursion. + * Use the the raw version of smp_processor_id and not + * __get_cpu_var which can call debug hooks that can + * cause a recursive crash here. + */ + cpu = raw_smp_processor_id(); + per_cpu(ftrace_shutdown_disable_cpu, cpu)++; + if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1) goto out; if (unlikely(ftrace_record_suspend)) @@ -317,7 +324,7 @@ ftrace_record_ip(unsigned long ip) out_unlock: spin_unlock_irqrestore(&ftrace_shutdown_lock, flags); out: - __get_cpu_var(ftrace_shutdown_disable_cpu)--; + per_cpu(ftrace_shutdown_disable_cpu, cpu)--; /* prevent recursion with scheduler */ if (resched) -- cgit v1.2.3 From 6c6c27969a4c6024e6c8838829546c02aaddca18 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:02 +0200 Subject: ftrace: add readpos to struct trace_seq; add trace_seq_to_user() Refactor code from tracing_read_pipe() and create trace_seq_to_user(). Moved trace_seq_reset() call before iter->trace->read() call so that when all leftover data is returned, trace_seq is reset automatically. 
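The refactor above gives the sequence buffer a persistent read position, so a partially consumed line can be resumed by the next read() instead of being retransmitted or lost. A small userspace analogue of that contract (types, names and the -1 sentinel are invented; the kernel helper copies to user space and signals "drained" with -EBUSY):

#include <stdio.h>
#include <string.h>

struct seq {
        char buffer[128];
        size_t len;             /* bytes produced so far */
        size_t readpos;         /* bytes already handed to the reader */
};

/* Hand out up to cnt unread bytes; -1 means drained, caller refills and resets. */
static int seq_read(struct seq *s, char *dst, size_t cnt)
{
        size_t avail;

        if (s->readpos >= s->len)
                return -1;
        avail = s->len - s->readpos;
        if (cnt > avail)
                cnt = avail;
        memcpy(dst, s->buffer + s->readpos, cnt);
        s->readpos += cnt;
        return (int)cnt;
}

int main(void)
{
        struct seq s = { .len = 0, .readpos = 0 };
        char out[8];
        int n;

        s.len = strlen(strcpy(s.buffer, "hello trace pipe\n"));
        while ((n = seq_read(&s, out, sizeof(out) - 1)) > 0) {
                out[n] = '\0';
                fputs(out, stdout);     /* the line arrives in several chunks */
        }
        return 0;
}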
Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 73 ++++++++++++++++++++++++++-------------------------- kernel/trace/trace.h | 3 +++ 2 files changed, 39 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c232d8248a0..82ced406aac 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -400,6 +400,26 @@ static void trace_seq_reset(struct trace_seq *s) { s->len = 0; + s->readpos = 0; +} + +ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) +{ + int len; + int ret; + + if (s->len <= s->readpos) + return -EBUSY; + + len = s->len - s->readpos; + if (cnt > len) + cnt = len; + ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); + if (ret) + return -EFAULT; + + s->readpos += len; + return cnt; } static void @@ -2361,46 +2381,32 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, struct trace_iterator *iter = filp->private_data; struct trace_array_cpu *data; static cpumask_t mask; - static int start; unsigned long flags; #ifdef CONFIG_FTRACE int ftrace_save; #endif - int read = 0; int cpu; - int len; - int ret; + ssize_t sret; /* return any leftover data */ - if (iter->seq.len > start) { - len = iter->seq.len - start; - if (cnt > len) - cnt = len; - ret = copy_to_user(ubuf, iter->seq.buffer + start, cnt); - if (ret) - cnt = -EFAULT; - - start += len; + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (sret != -EBUSY) + return sret; + sret = 0; - return cnt; - } + trace_seq_reset(&iter->seq); mutex_lock(&trace_types_lock); if (iter->trace->read) { - ret = iter->trace->read(iter, filp, ubuf, cnt, ppos); - if (ret) { - read = ret; + sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); + if (sret) goto out; - } } - trace_seq_reset(&iter->seq); - start = 0; - while (trace_empty(iter)) { if ((filp->f_flags & O_NONBLOCK)) { - read = -EAGAIN; + sret = -EAGAIN; goto out; } @@ -2426,7 +2432,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, iter->tr->waiter = NULL; if (signal_pending(current)) { - read = -EINTR; + sret = -EINTR; goto out; } @@ -2496,6 +2502,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, } while (find_next_entry_inc(iter) != NULL) { + int ret; int len = iter->seq.len; ret = print_trace_line(iter); @@ -2526,24 +2533,16 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, local_irq_restore(flags); /* Now copy what we have to the user */ - read = iter->seq.len; - if (read > cnt) - read = cnt; - - ret = copy_to_user(ubuf, iter->seq.buffer, read); - - if (read < iter->seq.len) - start = read; - else + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (iter->seq.readpos >= iter->seq.len) trace_seq_reset(&iter->seq); - - if (ret) - read = -EFAULT; + if (sret == -EBUSY) + sret = 0; out: mutex_unlock(&trace_types_lock); - return read; + return sret; } static ssize_t diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ee53d706066..8845033ab49 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -155,6 +155,7 @@ struct tracer { struct trace_seq { unsigned char buffer[PAGE_SIZE]; unsigned int len; + unsigned int readpos; }; /* @@ -301,6 +302,8 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace, extern void *head_page(struct trace_array_cpu *data); extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); +extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, + size_t cnt); extern long ns2usecs(cycle_t nsec); extern 
unsigned long trace_flags; -- cgit v1.2.3 From 3eefae994d9224fb7771a3ddb683868363c23510 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:04 +0200 Subject: ftrace: limit trace entries Currently there is no protection from the root user to use up all of memory for trace buffers. If the root user allocates too many entries, the OOM killer might start kill off all tasks. This patch adds an algorith to check the following condition: pages_requested > (freeable_memory + current_trace_buffer_pages) / 4 If the above is met then the allocation fails. The above prevents more than 1/4th of freeable memory from being used by trace buffers. To determine the freeable_memory, I made determine_dirtyable_memory in mm/page-writeback.c global. Special thanks goes to Peter Zijlstra for suggesting the above calculation. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 82ced406aac..2824cf48cdc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -51,6 +52,8 @@ static int trace_free_page(void); static int tracing_disabled = 1; +static unsigned long tracing_pages_allocated; + long ns2usecs(cycle_t nsec) { @@ -2591,12 +2594,41 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } if (val > global_trace.entries) { + long pages_requested; + unsigned long freeable_pages; + + /* make sure we have enough memory before mapping */ + pages_requested = + (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE; + + /* account for each buffer (and max_tr) */ + pages_requested *= tracing_nr_buffers * 2; + + /* Check for overflow */ + if (pages_requested < 0) { + cnt = -ENOMEM; + goto out; + } + + freeable_pages = determine_dirtyable_memory(); + + /* we only allow to request 1/4 of useable memory */ + if (pages_requested > + ((freeable_pages + tracing_pages_allocated) / 4)) { + cnt = -ENOMEM; + goto out; + } + while (global_trace.entries < val) { if (trace_alloc_page()) { cnt = -ENOMEM; goto out; } + /* double check that we don't go over the known pages */ + if (tracing_pages_allocated > pages_requested) + break; } + } else { /* include the number of entries in val (inc of page entries) */ while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1)) @@ -2776,6 +2808,7 @@ static int trace_alloc_page(void) struct page *page, *tmp; LIST_HEAD(pages); void *array; + unsigned pages_allocated = 0; int i; /* first allocate a page for each CPU */ @@ -2787,6 +2820,7 @@ static int trace_alloc_page(void) goto free_pages; } + pages_allocated++; page = virt_to_page(array); list_add(&page->lru, &pages); @@ -2798,6 +2832,7 @@ static int trace_alloc_page(void) "for trace buffer!\n"); goto free_pages; } + pages_allocated++; page = virt_to_page(array); list_add(&page->lru, &pages); #endif @@ -2819,6 +2854,7 @@ static int trace_alloc_page(void) SetPageLRU(page); #endif } + tracing_pages_allocated += pages_allocated; global_trace.entries += ENTRIES_PER_PAGE; return 0; @@ -2853,6 +2889,8 @@ static int trace_free_page(void) page = list_entry(p, struct page, lru); ClearPageLRU(page); list_del(&page->lru); + tracing_pages_allocated--; + tracing_pages_allocated--; __free_page(page); tracing_reset(data); -- cgit v1.2.3 From dc102a8fae2d0d6bf5223fc549247f2e23959ae6 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 
12 May 2008 21:21:09 +0200 Subject: Markers - remove extra format argument Denys Vlasenko : > Not in this patch, but I noticed: > > #define __trace_mark(name, call_private, format, args...) \ > do { \ > static const char __mstrtab_##name[] \ > __attribute__((section("__markers_strings"))) \ > = #name "\0" format; \ > static struct marker __mark_##name \ > __attribute__((section("__markers"), aligned(8))) = \ > { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)], \ > 0, 0, marker_probe_cb, \ > { __mark_empty_function, NULL}, NULL }; \ > __mark_check_format(format, ## args); \ > if (unlikely(__mark_##name.state)) { \ > (*__mark_##name.call) \ > (&__mark_##name, call_private, \ > format, ## args); \ > } \ > } while (0) > > In this call: > > (*__mark_##name.call) \ > (&__mark_##name, call_private, \ > format, ## args); \ > > you make gcc allocate duplicate format string. You can use > &__mstrtab_##name[sizeof(#name)] instead since it holds the same string, > or drop ", format," above and "const char *fmt" from here: > > void (*call)(const struct marker *mdata, /* Probe wrapper */ > void *call_private, const char *fmt, ...); > > since mdata->format is the same and all callees which need it can take it there. Very good point. I actually thought about dropping it, since it would remove an unnecessary argument from the stack. And actually, since I now have the marker_probe_cb sitting between the marker site and the callbacks, there is no API change required. Thanks :) Mathieu Signed-off-by: Mathieu Desnoyers CC: Denys Vlasenko Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/marker.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index b5a9fe1d50d..1abfb923b76 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -55,8 +55,8 @@ static DEFINE_MUTEX(markers_mutex); struct marker_entry { struct hlist_node hlist; char *format; - void (*call)(const struct marker *mdata, /* Probe wrapper */ - void *call_private, const char *fmt, ...); + /* Probe wrapper */ + void (*call)(const struct marker *mdata, void *call_private, ...); struct marker_probe_closure single; struct marker_probe_closure *multi; int refcount; /* Number of times armed. 0 if disarmed. */ @@ -91,15 +91,13 @@ EXPORT_SYMBOL_GPL(__mark_empty_function); * marker_probe_cb Callback that prepares the variable argument list for probes. * @mdata: pointer of type struct marker * @call_private: caller site private data - * @fmt: format string * @...: Variable argument list. * * Since we do not use "typical" pointer based RCU in the 1 argument case, we * need to put a full smp_rmb() in this branch. This is why we do not use * rcu_dereference() for the pointer read. */ -void marker_probe_cb(const struct marker *mdata, void *call_private, - const char *fmt, ...) +void marker_probe_cb(const struct marker *mdata, void *call_private, ...) { va_list args; char ptype; @@ -120,8 +118,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, /* Must read the ptr before private data. They are not data * dependant, so we put an explicit smp_rmb() here. 
*/ smp_rmb(); - va_start(args, fmt); - func(mdata->single.probe_private, call_private, fmt, &args); + va_start(args, call_private); + func(mdata->single.probe_private, call_private, mdata->format, + &args); va_end(args); } else { struct marker_probe_closure *multi; @@ -136,9 +135,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, smp_read_barrier_depends(); multi = mdata->multi; for (i = 0; multi[i].func; i++) { - va_start(args, fmt); - multi[i].func(multi[i].probe_private, call_private, fmt, - &args); + va_start(args, call_private); + multi[i].func(multi[i].probe_private, call_private, + mdata->format, &args); va_end(args); } } @@ -150,13 +149,11 @@ EXPORT_SYMBOL_GPL(marker_probe_cb); * marker_probe_cb Callback that does not prepare the variable argument list. * @mdata: pointer of type struct marker * @call_private: caller site private data - * @fmt: format string * @...: Variable argument list. * * Should be connected to markers "MARK_NOARGS". */ -void marker_probe_cb_noarg(const struct marker *mdata, - void *call_private, const char *fmt, ...) +void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) { va_list args; /* not initialized */ char ptype; @@ -172,7 +169,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, /* Must read the ptr before private data. They are not data * dependant, so we put an explicit smp_rmb() here. */ smp_rmb(); - func(mdata->single.probe_private, call_private, fmt, &args); + func(mdata->single.probe_private, call_private, mdata->format, + &args); } else { struct marker_probe_closure *multi; int i; @@ -186,8 +184,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, smp_read_barrier_depends(); multi = mdata->multi; for (i = 0; multi[i].func; i++) - multi[i].func(multi[i].probe_private, call_private, fmt, - &args); + multi[i].func(multi[i].probe_private, call_private, + mdata->format, &args); } preempt_enable(); } -- cgit v1.2.3 From 5b82a1b08a00b2adca3d9dd9777efff40b7aaaa1 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 12 May 2008 21:21:10 +0200 Subject: Port ftrace to markers Porting ftrace to the marker infrastructure. Don't need to chain to the wakeup tracer from the sched tracer, because markers support multiple probes connected. 
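For readers new to the marker API used in this port: an instrumentation site is a trace_mark() with a name and a printf-style format, and any number of probes can be attached to that name at run time, which is why the explicit chaining to the wakeup tracer can go away. A stripped-down sketch of the pairing; the event name, format and payload below are invented, not the scheduler markers added by this patch:

#include <linux/kernel.h>
#include <linux/marker.h>

/* 1) Instrumentation site: near-zero cost while no probe is attached. */
void example_event(int value, void *obj)
{
        trace_mark(example_subsys_event, "value %d obj %p", value, obj);
}

/* 2) Probe: gets its private pointer, the format string and a va_list. */
static void example_probe(void *probe_private, void *call_private,
                          const char *format, va_list *args)
{
        int value = va_arg(*args, int);
        void *obj = va_arg(*args, void *);

        printk(KERN_DEBUG "example: value=%d obj=%p\n", value, obj);
}

/* 3) Attach by name; several probes may be registered on the same marker. */
int example_probe_init(void)
{
        return marker_probe_register("example_subsys_event",
                                     "value %d obj %p",
                                     example_probe, NULL);
}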
Signed-off-by: Mathieu Desnoyers CC: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 14 +++- kernel/trace/trace.h | 20 +---- kernel/trace/trace_sched_switch.c | 171 +++++++++++++++++++++++++++++++------- kernel/trace/trace_sched_wakeup.c | 106 +++++++++++++++++++++-- 4 files changed, 255 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ad95cca4e42..e2e985eeee7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2500,7 +2500,9 @@ out_activate: success = 1; out_running: - ftrace_wake_up_task(rq, p, rq->curr); + trace_mark(kernel_sched_wakeup, + "pid %d state %ld ## rq %p task %p rq->curr %p", + p->pid, p->state, rq, p, rq->curr); check_preempt_curr(rq, p); p->state = TASK_RUNNING; @@ -2631,7 +2633,9 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - ftrace_wake_up_task(rq, p, rq->curr); + trace_mark(kernel_sched_wakeup_new, + "pid %d state %ld ## rq %p task %p rq->curr %p", + p->pid, p->state, rq, p, rq->curr); check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2804,7 +2808,11 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - ftrace_ctx_switch(rq, prev, next); + trace_mark(kernel_sched_schedule, + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + prev->pid, next->pid, prev->state, + rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8845033ab49..f5de0601b40 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -234,25 +234,10 @@ void update_max_tr_single(struct trace_array *tr, extern cycle_t ftrace_now(int cpu); -#ifdef CONFIG_SCHED_TRACER -extern void -wakeup_sched_switch(struct task_struct *prev, struct task_struct *next); -extern void -wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr); -#else -static inline void -wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) -{ -} -static inline void -wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr) -{ -} -#endif - #ifdef CONFIG_CONTEXT_SWITCH_TRACER typedef void (*tracer_switch_func_t)(void *private, + void *__rq, struct task_struct *prev, struct task_struct *next); @@ -262,9 +247,6 @@ struct tracer_switch_ops { struct tracer_switch_ops *next; }; -extern int register_tracer_switch(struct tracer_switch_ops *ops); -extern int unregister_tracer_switch(struct tracer_switch_ops *ops); - #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index a3376478fc2..d25ffa5eaf2 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -16,11 +16,14 @@ static struct trace_array *ctx_trace; static int __read_mostly tracer_enabled; +static atomic_t sched_ref; static void -ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) +sched_switch_func(void *private, void *__rq, struct task_struct *prev, + struct task_struct *next) { - struct trace_array *tr = ctx_trace; + struct trace_array **ptr = private; + struct trace_array *tr = *ptr; struct trace_array_cpu *data; unsigned long flags; long disabled; @@ -41,10 +44,40 @@ ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) local_irq_restore(flags); } +static notrace void 
+sched_switch_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct task_struct *prev; + struct task_struct *next; + struct rq *__rq; + + if (!atomic_read(&sched_ref)) + return; + + /* skip prev_pid %d next_pid %d prev_state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, int); + (void)va_arg(*args, long); + __rq = va_arg(*args, typeof(__rq)); + prev = va_arg(*args, typeof(prev)); + next = va_arg(*args, typeof(next)); + + tracing_record_cmdline(prev); + + /* + * If tracer_switch_func only points to the local + * switch func, it still needs the ptr passed to it. + */ + sched_switch_func(probe_data, __rq, prev, next); +} + static void -wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr) +wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct + task_struct *curr) { - struct trace_array *tr = ctx_trace; + struct trace_array **ptr = private; + struct trace_array *tr = *ptr; struct trace_array_cpu *data; unsigned long flags; long disabled; @@ -67,35 +100,29 @@ wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr) local_irq_restore(flags); } -void -ftrace_ctx_switch(void *__rq, struct task_struct *prev, - struct task_struct *next) +static notrace void +wake_up_callback(void *probe_data, void *call_data, + const char *format, va_list *args) { - if (unlikely(atomic_read(&trace_record_cmdline_enabled))) - tracing_record_cmdline(prev); + struct task_struct *curr; + struct task_struct *task; + struct rq *__rq; - /* - * If tracer_switch_func only points to the local - * switch func, it still needs the ptr passed to it. - */ - ctx_switch_func(__rq, prev, next); + if (likely(!tracer_enabled)) + return; - /* - * Chain to the wakeup tracer (this is a NOP if disabled): - */ - wakeup_sched_switch(prev, next); -} + /* Skip pid %d state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, long); + /* now get the meat: "rq %p task %p rq->curr %p" */ + __rq = va_arg(*args, typeof(__rq)); + task = va_arg(*args, typeof(task)); + curr = va_arg(*args, typeof(curr)); -void -ftrace_wake_up_task(void *__rq, struct task_struct *wakee, - struct task_struct *curr) -{ - wakeup_func(__rq, wakee, curr); + tracing_record_cmdline(task); + tracing_record_cmdline(curr); - /* - * Chain to the wakeup tracer (this is a NOP if disabled): - */ - wakeup_sched_wakeup(wakee, curr); + wakeup_func(probe_data, __rq, task, curr); } void @@ -132,15 +159,95 @@ static void sched_switch_reset(struct trace_array *tr) tracing_reset(tr->data[cpu]); } +static int tracing_sched_register(void) +{ + int ret; + + ret = marker_probe_register("kernel_sched_wakeup", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &ctx_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup\n"); + return ret; + } + + ret = marker_probe_register("kernel_sched_wakeup_new", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &ctx_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + ret = marker_probe_register("kernel_sched_schedule", + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + sched_switch_callback, + &ctx_trace); + if (ret) { + pr_info("sched trace: Couldn't add marker" + " probe to kernel_sched_schedule\n"); + goto fail_deprobe_wake_new; + } + + return ret; +fail_deprobe_wake_new: + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + 
&ctx_trace); +fail_deprobe: + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &ctx_trace); + return ret; +} + +static void tracing_sched_unregister(void) +{ + marker_probe_unregister("kernel_sched_schedule", + sched_switch_callback, + &ctx_trace); + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &ctx_trace); + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &ctx_trace); +} + +void tracing_start_sched_switch(void) +{ + long ref; + + ref = atomic_inc_return(&sched_ref); + if (ref == 1) + tracing_sched_register(); +} + +void tracing_stop_sched_switch(void) +{ + long ref; + + ref = atomic_dec_and_test(&sched_ref); + if (ref) + tracing_sched_unregister(); +} + static void start_sched_trace(struct trace_array *tr) { sched_switch_reset(tr); atomic_inc(&trace_record_cmdline_enabled); tracer_enabled = 1; + tracing_start_sched_switch(); } static void stop_sched_trace(struct trace_array *tr) { + tracing_stop_sched_switch(); atomic_dec(&trace_record_cmdline_enabled); tracer_enabled = 0; } @@ -181,6 +288,14 @@ static struct tracer sched_switch_trace __read_mostly = __init static int init_sched_switch_trace(void) { + int ret = 0; + + if (atomic_read(&sched_ref)) + ret = tracing_sched_register(); + if (ret) { + pr_info("error registering scheduler trace\n"); + return ret; + } return register_tracer(&sched_switch_trace); } device_initcall(init_sched_switch_trace); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 5948011006b..5d2fb48e47f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "trace.h" @@ -44,11 +45,13 @@ static int report_latency(cycle_t delta) return 1; } -void -wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) +static void notrace +wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, + struct task_struct *next) { unsigned long latency = 0, t0 = 0, t1 = 0; - struct trace_array *tr = wakeup_trace; + struct trace_array **ptr = private; + struct trace_array *tr = *ptr; struct trace_array_cpu *data; cycle_t T0, T1, delta; unsigned long flags; @@ -113,6 +116,31 @@ out: atomic_dec(&tr->data[cpu]->disabled); } +static notrace void +sched_switch_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct task_struct *prev; + struct task_struct *next; + struct rq *__rq; + + /* skip prev_pid %d next_pid %d prev_state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, int); + (void)va_arg(*args, long); + __rq = va_arg(*args, typeof(__rq)); + prev = va_arg(*args, typeof(prev)); + next = va_arg(*args, typeof(next)); + + tracing_record_cmdline(prev); + + /* + * If tracer_switch_func only points to the local + * switch func, it still needs the ptr passed to it. 
+ */ + wakeup_sched_switch(probe_data, __rq, prev, next); +} + static void __wakeup_reset(struct trace_array *tr) { struct trace_array_cpu *data; @@ -188,19 +216,68 @@ out: atomic_dec(&tr->data[cpu]->disabled); } -void wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr) +static notrace void +wake_up_callback(void *probe_data, void *call_data, + const char *format, va_list *args) { + struct trace_array **ptr = probe_data; + struct trace_array *tr = *ptr; + struct task_struct *curr; + struct task_struct *task; + struct rq *__rq; + if (likely(!tracer_enabled)) return; + /* Skip pid %d state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, long); + /* now get the meat: "rq %p task %p rq->curr %p" */ + __rq = va_arg(*args, typeof(__rq)); + task = va_arg(*args, typeof(task)); + curr = va_arg(*args, typeof(curr)); + + tracing_record_cmdline(task); tracing_record_cmdline(curr); - tracing_record_cmdline(wakee); - wakeup_check_start(wakeup_trace, wakee, curr); + wakeup_check_start(tr, task, curr); } static void start_wakeup_tracer(struct trace_array *tr) { + int ret; + + ret = marker_probe_register("kernel_sched_wakeup", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &wakeup_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup\n"); + return; + } + + ret = marker_probe_register("kernel_sched_wakeup_new", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &wakeup_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + ret = marker_probe_register("kernel_sched_schedule", + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + sched_switch_callback, + &wakeup_trace); + if (ret) { + pr_info("sched trace: Couldn't add marker" + " probe to kernel_sched_schedule\n"); + goto fail_deprobe_wake_new; + } + wakeup_reset(tr); /* @@ -215,11 +292,28 @@ static void start_wakeup_tracer(struct trace_array *tr) tracer_enabled = 1; return; +fail_deprobe_wake_new: + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &wakeup_trace); +fail_deprobe: + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &wakeup_trace); } static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; + marker_probe_unregister("kernel_sched_schedule", + sched_switch_callback, + &wakeup_trace); + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &wakeup_trace); + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &wakeup_trace); } static void wakeup_tracer_init(struct trace_array *tr) -- cgit v1.2.3 From 74f4e369fc5b52433ad824cef32d3bf1304549be Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:15 +0200 Subject: ftrace: stacktrace fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/semaphore.c | 2 ++ kernel/trace/trace.c | 4 ++-- kernel/trace/trace.h | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 5c2942e768c..1a064adab65 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -31,6 +31,7 @@ #include #include #include +#include static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); @@ -53,6 +54,7 @@ void down(struct semaphore *sem) { unsigned long flags; + ftrace_special(sem->count, 0, __LINE__); spin_lock_irqsave(&sem->lock, flags); if 
(likely(sem->count > 0)) sem->count--; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2824cf48cdc..3271916ff03 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -901,7 +901,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.next_pid = next->pid; entry->ctx.next_prio = next->prio; entry->ctx.next_state = next->state; - __trace_stack(tr, data, flags, 4); + __trace_stack(tr, data, flags, 5); __raw_spin_unlock(&data->lock); raw_local_irq_restore(irq_flags); } @@ -927,7 +927,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.next_pid = wakee->pid; entry->ctx.next_prio = wakee->prio; entry->ctx.next_state = wakee->state; - __trace_stack(tr, data, flags, 5); + __trace_stack(tr, data, flags, 6); __raw_spin_unlock(&data->lock); raw_local_irq_restore(irq_flags); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f5de0601b40..c460e85e94e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -51,7 +51,7 @@ struct special_entry { * Stack-trace entry: */ -#define FTRACE_STACK_ENTRIES 5 +#define FTRACE_STACK_ENTRIES 8 struct stack_entry { unsigned long caller[FTRACE_STACK_ENTRIES]; -- cgit v1.2.3 From aa5e5ceaf52a882a29d9b86531a20733f5116066 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 13 May 2008 22:06:56 -0700 Subject: ftrace: remove packed attribute on ftrace_page. It causes unaligned access traps on platforms like sparc (ftrace_page may be marked packed, but once we return a dyn_ftrace sub-object from this array to another piece of code, the "packed" part of the typing information doesn't propagate). But also, it didn't serve any purpose either. Even if packed, on 64-bit or 32-bit, it didn't give us any more dyn_ftrace entries per-page. Signed-off-by: David S. Miller Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index af5ad8949ab..07b2a14943f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -177,9 +177,9 @@ static DEFINE_MUTEX(ftrace_filter_lock); struct ftrace_page { struct ftrace_page *next; - int index; + unsigned long index; struct dyn_ftrace records[]; -} __attribute__((packed)); +}; #define ENTRIES_PER_PAGE \ ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) -- cgit v1.2.3 From 37135677e653537ffc6e7def679443272a1c03c3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 14 May 2008 08:10:31 +0200 Subject: ftrace: fix mcount export bug David S. Miller noticed the following bug: the -pg instrumentation function callback is named differently on each platform. On x86 it is mcount, on sparc it is _mcount. So the export does not make sense in kernel/trace/ftrace.c - move it to x86. 
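The arch-side replacement for the removed export is not part of this hunk; as a rough sketch (file placement and the extern declaration are assumptions, not the literal arch patch), it amounts to each architecture exporting whatever symbol its -pg call site actually uses:

#include <linux/module.h>

/* mcount (or _mcount, depending on the architecture) is defined in arch assembly */
extern void mcount(void);
EXPORT_SYMBOL(mcount);
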
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 07b2a14943f..a3e47f43f8a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -50,9 +50,6 @@ static struct ftrace_ops ftrace_list_end __read_mostly = static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; -/* mcount is defined per arch in assembly */ -EXPORT_SYMBOL(mcount); - void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op = ftrace_list; -- cgit v1.2.3 From 2d8b820b2e81954754277723379ae9ed5de316fa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Feb 2008 16:55:50 +0100 Subject: ftrace: cleanups factor out code and clean it up. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a3e47f43f8a..89bd9a6f52e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -18,13 +18,13 @@ #include #include #include -#include #include -#include +#include #include +#include #include -#include #include +#include #include #include "trace.h" -- cgit v1.2.3 From 4e491d14f2506b218d678935c25a7027b79178b1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 May 2008 23:49:44 -0400 Subject: ftrace: support for PowerPC This patch adds full support for ftrace for PowerPC (both 64 and 32 bit). This includes dynamic tracing and function filtering. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace_selftest.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a5f6001c333..3877dd9102f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -123,6 +123,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, int ret; int save_ftrace_enabled = ftrace_enabled; int save_tracer_enabled = tracer_enabled; + char *func_name; /* The ftrace test PASSED */ printk(KERN_CONT "PASSED\n"); @@ -142,9 +143,15 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, return ret; } + /* + * Some archs *cough*PowerPC*cough* add charachters to the + * start of the function names. We simply put a '*' to + * accomodate them. + */ + func_name = "*" STR(DYN_FTRACE_TEST_NAME); + /* filter only on our function */ - ftrace_set_filter(STR(DYN_FTRACE_TEST_NAME), - sizeof(STR(DYN_FTRACE_TEST_NAME)), 1); + ftrace_set_filter(func_name, strlen(func_name), 1); /* enable tracing */ tr->ctrl = 1; -- cgit v1.2.3 From 6ec562328fda585be2d7f472cfac99d3b44d362a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 May 2008 21:30:30 -0400 Subject: ftrace: use the new kbuild CFLAGS_REMOVE for kernel directory This patch removes the Makefile turd and uses the nice CFLAGS_REMOVE macro in the kernel directory. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/Makefile | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index d2f80ea4cd9..ca2433e8487 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,12 +11,16 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o +CFLAGS_REMOVE_sched.o = -pg -mno-spe + ifdef CONFIG_FTRACE -# Do not profile debug utilities -ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(if $(filter-out lockdep% %debug,$(basename $(notdir $@))), \ - $(ORIG_CFLAGS), \ - $(subst -pg,,$(ORIG_CFLAGS))) +# Do not trace debug files and internal ftrace files +CFLAGS_REMOVE_lockdep.o = -pg +CFLAGS_REMOVE_lockdep_proc.o = -pg +CFLAGS_REMOVE_mutex-debug.o = -pg +CFLAGS_REMOVE_rtmutex-debug.o = -pg +CFLAGS_REMOVE_cgroup-debug.o = -pg +CFLAGS_REMOVE_sched_clock.o = -pg endif obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o -- cgit v1.2.3 From 677aa9f77e8de3791b481a0cec6c8b84d1eec626 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 17 May 2008 00:01:36 -0400 Subject: ftrace: add have dynamic ftrace config for archs Now that ftrace is being ported to other architectures, it has become apparent that DYNAMIC_FTRACE is dependent on whether or not that architecture implements dynamic ftrace. FTRACE itself may be ported to an architecture without porting dynamic ftrace. This patch adds HAVE_DYNAMIC_FTRACE to allow architectures to port ftrace without having to also port the dynamic aspect as well. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f3005717bcd..5c2295b29f2 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -4,6 +4,9 @@ config HAVE_FTRACE bool +config HAVE_DYNAMIC_FTRACE + bool + config TRACER_MAX_TRACE bool @@ -94,6 +97,7 @@ config CONTEXT_SWITCH_TRACER config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FTRACE + depends on HAVE_DYNAMIC_FTRACE default y help This option will modify all the calls to ftrace dynamically -- cgit v1.2.3 From f06c38103ea9dbca27c3f4d77f444ddefb5477cd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:47 +0200 Subject: ftrace: add sysprof plugin very first baby version. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/Kconfig | 8 +++++ kernel/trace/Makefile | 1 + kernel/trace/trace_sysprof.c | 80 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+) create mode 100644 kernel/trace/trace_sysprof.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5c2295b29f2..e101c9a85f0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -75,6 +75,14 @@ config PREEMPT_TRACER enabled. This option and the irqs-off timing option can be used together or separately.) +config SYSPROF_TRACER + bool "Sysprof Tracer" + depends on DEBUG_KERNEL + select TRACING + help + This tracer provides the trace needed by the 'Sysprof' userspace + tool. 
+ config SCHED_TRACER bool "Scheduling Latency Tracer" depends on HAVE_FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d9efbbfa2bd..7aec123ec1d 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_FTRACE) += libftrace.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o +obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FTRACE) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c new file mode 100644 index 00000000000..6c139bc1be7 --- /dev/null +++ b/kernel/trace/trace_sysprof.c @@ -0,0 +1,80 @@ +/* + * trace stack traces + * + * Copyright (C) 2007 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *ctx_trace; +static int __read_mostly tracer_enabled; + +static notrace void stack_reset(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} + +static notrace void start_stack_trace(struct trace_array *tr) +{ + stack_reset(tr); + tracer_enabled = 1; +} + +static notrace void stop_stack_trace(struct trace_array *tr) +{ + tracer_enabled = 0; +} + +static notrace void stack_trace_init(struct trace_array *tr) +{ + ctx_trace = tr; + + if (tr->ctrl) + start_stack_trace(tr); +} + +static notrace void stack_trace_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_stack_trace(tr); +} + +static void stack_trace_ctrl_update(struct trace_array *tr) +{ + /* When starting a new trace, reset the buffers */ + if (tr->ctrl) + start_stack_trace(tr); + else + stop_stack_trace(tr); +} + +static struct tracer stack_trace __read_mostly = +{ + .name = "sysprof", + .init = stack_trace_init, + .reset = stack_trace_reset, + .ctrl_update = stack_trace_ctrl_update, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_stack, +#endif +}; + +__init static int init_stack_trace(void) +{ + return register_tracer(&stack_trace); +} +device_initcall(init_stack_trace); -- cgit v1.2.3 From 0075fa80305f3231a2d5df97b00d7f55a48ea27e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:47 +0200 Subject: ftrace: extend sysprof plugin add per CPU hrtimers. 
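Condensed into a standalone sketch (identifier names here are illustrative, not taken from the patch), the per-CPU sampling pattern the following diff introduces looks like this, assuming the 2.6.26-era hrtimer API used throughout this series:

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/percpu.h>

static const unsigned long sample_period = 1000000;	/* ns */

static DEFINE_PER_CPU(struct hrtimer, sample_hrtimer);

static enum hrtimer_restart sample_fn(struct hrtimer *hrtimer)
{
	/* take a sample here, then re-arm relative to now */
	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
	return HRTIMER_RESTART;
}

static void arm_cpu_timer(int cpu)
{
	struct hrtimer *hrtimer = &per_cpu(sample_hrtimer, cpu);

	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer->function = sample_fn;
	/* the patch below additionally sets cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ */

	/*
	 * hrtimer_start() queues the timer on the CPU it runs on, which is
	 * why the patch binds the starting task to each CPU in turn with
	 * set_cpus_allowed_ptr() before arming that CPU's timer.
	 */
	hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
}
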
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace_sysprof.c | 67 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 6c139bc1be7..ba55b871b3d 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -5,19 +5,76 @@ * Copyright (C) 2008 Ingo Molnar * */ -#include -#include -#include #include +#include +#include #include -#include #include +#include +#include #include "trace.h" static struct trace_array *ctx_trace; static int __read_mostly tracer_enabled; +static const unsigned long sample_period = 1000000; + +/* + * Per CPU hrtimers that do the profiling: + */ +static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer); + +static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) +{ + /* trace here */ + panic_timeout++; + + hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); + + return HRTIMER_RESTART; +} + +static void start_stack_timer(int cpu) +{ + struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); + + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = stack_trace_timer_fn; + hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + + hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); +} + +static void start_stack_timers(void) +{ + cpumask_t saved_mask = current->cpus_allowed; + int cpu; + + for_each_online_cpu(cpu) { + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + start_stack_timer(cpu); + printk("started timer on cpu%d\n", cpu); + } + set_cpus_allowed_ptr(current, &saved_mask); +} + +static void stop_stack_timer(int cpu) +{ + struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); + + hrtimer_cancel(hrtimer); + printk("cancelled timer on cpu%d\n", cpu); +} + +static void stop_stack_timers(void) +{ + int cpu; + + for_each_online_cpu(cpu) + stop_stack_timer(cpu); +} + static notrace void stack_reset(struct trace_array *tr) { int cpu; @@ -31,11 +88,13 @@ static notrace void stack_reset(struct trace_array *tr) static notrace void start_stack_trace(struct trace_array *tr) { stack_reset(tr); + start_stack_timers(); tracer_enabled = 1; } static notrace void stop_stack_trace(struct trace_array *tr) { + stop_stack_timers(); tracer_enabled = 0; } -- cgit v1.2.3 From 56a08bdcff20f0022bd9160c1093e56f763499aa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:47 +0200 Subject: ftrace: extend sysprof plugin some more Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace_sysprof.c | 80 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 76 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index ba55b871b3d..b1137c11ef8 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -3,7 +3,7 @@ * * Copyright (C) 2007 Steven Rostedt * Copyright (C) 2008 Ingo Molnar - * + * Copyright (C) 2004, 2005, Soeren Sandmann */ #include #include @@ -11,13 +11,17 @@ #include #include #include +#include #include #include "trace.h" -static struct trace_array *ctx_trace; +static struct trace_array *sysprof_trace; static int __read_mostly tracer_enabled; +/* + * 10 msecs for now: + */ static const unsigned long sample_period = 1000000; /* @@ -25,10 +29,78 @@ static const unsigned long sample_period = 1000000; */ static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer); +struct stack_frame 
{ + const void __user *next_fp; + unsigned long return_address; +}; + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) + return 0; + + if (__copy_from_user_inatomic(frame, frame_pointer, sizeof(*frame))) + return 0; + + return 1; +} + +#define SYSPROF_MAX_ADDRESSES 512 + +static void timer_notify(struct pt_regs *regs, int cpu) +{ + const void __user *frame_pointer; + struct trace_array_cpu *data; + struct stack_frame frame; + struct trace_array *tr; + int is_user; + int i; + + if (!regs) + return; + + tr = sysprof_trace; + data = tr->data[cpu]; + is_user = user_mode(regs); + + if (!current || current->pid == 0) + return; + + if (is_user && current->state != TASK_RUNNING) + return; + + if (!is_user) { + /* kernel */ + ftrace(tr, data, current->pid, 1, 0); + return; + + } + + trace_special(tr, data, 0, current->pid, regs->ip); + + frame_pointer = (void __user *)regs->bp; + + for (i = 0; i < SYSPROF_MAX_ADDRESSES; i++) { + if (!copy_stack_frame(frame_pointer, &frame)) + break; + if ((unsigned long)frame_pointer < regs->sp) + break; + + trace_special(tr, data, 1, frame.return_address, + (unsigned long)frame_pointer); + frame_pointer = frame.next_fp; + } + + trace_special(tr, data, 2, current->pid, i); + + if (i == SYSPROF_MAX_ADDRESSES) + trace_special(tr, data, -1, -1, -1); +} + static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) { /* trace here */ - panic_timeout++; + timer_notify(get_irq_regs(), smp_processor_id()); hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); @@ -100,7 +172,7 @@ static notrace void stop_stack_trace(struct trace_array *tr) static notrace void stack_trace_init(struct trace_array *tr) { - ctx_trace = tr; + sysprof_trace = tr; if (tr->ctrl) start_stack_trace(tr); -- cgit v1.2.3 From a6dd24f8d00cbccb560b19a723e6fb9bdfb20799 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:47 +0200 Subject: ftrace: sysprof-plugin, add self-tests Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.h | 4 ++++ kernel/trace/trace_selftest.c | 28 ++++++++++++++++++++++++++++ kernel/trace/trace_sysprof.c | 6 +++--- 3 files changed, 35 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c460e85e94e..b2198bc830a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -280,6 +280,10 @@ extern int trace_selftest_startup_wakeup(struct tracer *trace, extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); #endif +#ifdef CONFIG_SYSPROF_TRACER +extern int trace_selftest_startup_sysprof(struct tracer *trace, + struct trace_array *tr); +#endif #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 3877dd9102f..033a6fb2e5f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -537,3 +537,31 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr return ret; } #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ + +#ifdef CONFIG_SYSPROF_TRACER +int +trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. 
*/ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_SYSPROF_TRACER */ diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index b1137c11ef8..b78f12f77fc 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -126,7 +126,7 @@ static void start_stack_timers(void) for_each_online_cpu(cpu) { set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); start_stack_timer(cpu); - printk("started timer on cpu%d\n", cpu); + printk(KERN_INFO "started sysprof timer on cpu%d\n", cpu); } set_cpus_allowed_ptr(current, &saved_mask); } @@ -136,7 +136,7 @@ static void stop_stack_timer(int cpu) struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); hrtimer_cancel(hrtimer); - printk("cancelled timer on cpu%d\n", cpu); + printk(KERN_INFO "cancelled sysprof timer on cpu%d\n", cpu); } static void stop_stack_timers(void) @@ -200,7 +200,7 @@ static struct tracer stack_trace __read_mostly = .reset = stack_trace_reset, .ctrl_update = stack_trace_ctrl_update, #ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_stack, + .selftest = trace_selftest_startup_sysprof, #endif }; -- cgit v1.2.3 From 842af315e8b0adad58fc642eaa5e6f53525e0534 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:47 +0200 Subject: ftrace: sysprof plugin improvement add sample maximum depth. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace_sysprof.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index b78f12f77fc..7f6fcccffb8 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -23,6 +23,7 @@ static int __read_mostly tracer_enabled; * 10 msecs for now: */ static const unsigned long sample_period = 1000000; +static const unsigned int sample_max_depth = 512; /* * Per CPU hrtimers that do the profiling: @@ -45,8 +46,6 @@ static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) return 1; } -#define SYSPROF_MAX_ADDRESSES 512 - static void timer_notify(struct pt_regs *regs, int cpu) { const void __user *frame_pointer; @@ -80,7 +79,7 @@ static void timer_notify(struct pt_regs *regs, int cpu) frame_pointer = (void __user *)regs->bp; - for (i = 0; i < SYSPROF_MAX_ADDRESSES; i++) { + for (i = 0; i < sample_max_depth; i++) { if (!copy_stack_frame(frame_pointer, &frame)) break; if ((unsigned long)frame_pointer < regs->sp) @@ -93,7 +92,7 @@ static void timer_notify(struct pt_regs *regs, int cpu) trace_special(tr, data, 2, current->pid, i); - if (i == SYSPROF_MAX_ADDRESSES) + if (i == sample_max_depth) trace_special(tr, data, -1, -1, -1); } @@ -126,7 +125,6 @@ static void start_stack_timers(void) for_each_online_cpu(cpu) { set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); start_stack_timer(cpu); - printk(KERN_INFO "started sysprof timer on cpu%d\n", cpu); } set_cpus_allowed_ptr(current, &saved_mask); } @@ -136,7 +134,6 @@ static void stop_stack_timer(int cpu) struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); hrtimer_cancel(hrtimer); - printk(KERN_INFO "cancelled sysprof timer on cpu%d\n", cpu); } static void stop_stack_timers(void) -- cgit v1.2.3 From ef4ab15ff34fd9c65e92bee70f58e7179da881c5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:48 +0200 
Subject: ftrace: make sysprof dependent on x86 for now that's the only tested platform for now. If there's interest we can make it generic easily. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/Kconfig | 2 +- kernel/trace/trace_selftest.c | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e101c9a85f0..9b49526ac0b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -77,7 +77,7 @@ config PREEMPT_TRACER config SYSPROF_TRACER bool "Sysprof Tracer" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && X86 select TRACING help This tracer provides the trace needed by the 'Sysprof' userspace diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 033a6fb2e5f..5588ecc4098 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -557,11 +557,6 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) ret = trace_test_buffer(tr, &count); trace->reset(tr); - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - } - return ret; } #endif /* CONFIG_SYSPROF_TRACER */ -- cgit v1.2.3 From 9f6b4e3f4a24f2590f1c96f117fc45fbea9b0fa4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:48 +0200 Subject: ftrace: sysprof fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace_sysprof.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 7f6fcccffb8..f9a09fe705b 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -37,21 +37,26 @@ struct stack_frame { static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) { + int ret; + if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) return 0; - if (__copy_from_user_inatomic(frame, frame_pointer, sizeof(*frame))) - return 0; + ret = 1; + pagefault_disable(); + if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + ret = 0; + pagefault_enable(); - return 1; + return ret; } static void timer_notify(struct pt_regs *regs, int cpu) { - const void __user *frame_pointer; struct trace_array_cpu *data; struct stack_frame frame; struct trace_array *tr; + const void __user *fp; int is_user; int i; @@ -77,21 +82,26 @@ static void timer_notify(struct pt_regs *regs, int cpu) trace_special(tr, data, 0, current->pid, regs->ip); - frame_pointer = (void __user *)regs->bp; + fp = (void __user *)regs->bp; for (i = 0; i < sample_max_depth; i++) { - if (!copy_stack_frame(frame_pointer, &frame)) + frame.next_fp = 0; + frame.return_address = 0; + if (!copy_stack_frame(fp, &frame)) break; - if ((unsigned long)frame_pointer < regs->sp) + if ((unsigned long)fp < regs->sp) break; trace_special(tr, data, 1, frame.return_address, - (unsigned long)frame_pointer); - frame_pointer = frame.next_fp; + (unsigned long)fp); + fp = frame.next_fp; } trace_special(tr, data, 2, current->pid, i); + /* + * Special trace entry if we overflow the max depth: + */ if (i == sample_max_depth) trace_special(tr, data, -1, -1, -1); } -- cgit v1.2.3 From d618b3e6e50970a6248ac857653fdd49bcd3c045 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:49 +0200 Subject: ftrace: sysprof updates make the sample period configurable. 
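The interface contract worth keeping in mind while reading the hunk below: the new debugfs file exchanges values in microseconds, while sample_period itself stays in nanoseconds for the hrtimers, with a 100 usec floor enforced on writes; the patch also stops and restarts the per-CPU timers under sample_timer_lock while the period changes. A minimal sketch of the conversion (the helper name is made up for illustration):

/* microseconds written from userspace -> nanoseconds for the hrtimer period */
static unsigned long sysprof_usecs_to_period(unsigned long usecs)
{
	if (usecs < 100)	/* enforce the minimum sample period */
		usecs = 100;
	return usecs * 1000;
}
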
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 3 ++ kernel/trace/trace.h | 2 ++ kernel/trace/trace_sysprof.c | 70 ++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 73 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3271916ff03..95b7c48a9a1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2800,6 +2800,9 @@ static __init void tracer_init_debugfs(void) pr_warning("Could not create debugfs " "'dyn_ftrace_total_info' entry\n"); #endif +#ifdef CONFIG_SYSPROF_TRACER + init_tracer_sysprof_debugfs(d_tracer); +#endif } static int trace_alloc_page(void) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b2198bc830a..b7f85d9c80d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -188,6 +188,8 @@ struct trace_iterator { void tracing_reset(struct trace_array_cpu *data); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *tracing_init_dentry(void); +void init_tracer_sysprof_debugfs(struct dentry *d_tracer); + void ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index f9a09fe705b..19406236b67 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -20,11 +20,12 @@ static struct trace_array *sysprof_trace; static int __read_mostly tracer_enabled; /* - * 10 msecs for now: + * 1 msec sample interval by default: */ -static const unsigned long sample_period = 1000000; +static unsigned long sample_period = 1000000; static const unsigned int sample_max_depth = 512; +static DEFINE_MUTEX(sample_timer_lock); /* * Per CPU hrtimers that do the profiling: */ @@ -166,15 +167,19 @@ static notrace void stack_reset(struct trace_array *tr) static notrace void start_stack_trace(struct trace_array *tr) { + mutex_lock(&sample_timer_lock); stack_reset(tr); start_stack_timers(); tracer_enabled = 1; + mutex_unlock(&sample_timer_lock); } static notrace void stop_stack_trace(struct trace_array *tr) { + mutex_lock(&sample_timer_lock); stop_stack_timers(); tracer_enabled = 0; + mutex_unlock(&sample_timer_lock); } static notrace void stack_trace_init(struct trace_array *tr) @@ -216,3 +221,64 @@ __init static int init_stack_trace(void) return register_tracer(&stack_trace); } device_initcall(init_stack_trace); + +#define MAX_LONG_DIGITS 22 + +static ssize_t +sysprof_sample_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_LONG_DIGITS]; + int r; + + r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period)); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +sysprof_sample_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_LONG_DIGITS]; + unsigned long val; + + if (cnt > MAX_LONG_DIGITS-1) + cnt = MAX_LONG_DIGITS-1; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + val = simple_strtoul(buf, NULL, 10); + /* + * Enforce a minimum sample period of 100 usecs: + */ + if (val < 100) + val = 100; + + mutex_lock(&sample_timer_lock); + stop_stack_timers(); + sample_period = val * 1000; + start_stack_timers(); + mutex_unlock(&sample_timer_lock); + + return cnt; +} + +static struct file_operations sysprof_sample_fops = { + .read = sysprof_sample_read, + .write = sysprof_sample_write, +}; + +void init_tracer_sysprof_debugfs(struct dentry *d_tracer) +{ + struct dentry *entry; + + entry = 
debugfs_create_file("sysprof_sample_period", 0644, + d_tracer, NULL, &sysprof_sample_fops); + if (entry) + return; + pr_warning("Could not create debugfs 'dyn_ftrace_total_info' entry\n"); +} -- cgit v1.2.3 From ada6b835067dc022f11cdae1c313a3710d3d977c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 23 May 2008 23:50:41 +0200 Subject: ftrace: remove notrace Remove the notrace annotations. The build logic takes care of that. Signed-off-by: Thomas Gleixner --- kernel/trace/trace_sysprof.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 19406236b67..3b1e4ba9180 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -155,7 +155,7 @@ static void stop_stack_timers(void) stop_stack_timer(cpu); } -static notrace void stack_reset(struct trace_array *tr) +static void stack_reset(struct trace_array *tr) { int cpu; @@ -165,7 +165,7 @@ static notrace void stack_reset(struct trace_array *tr) tracing_reset(tr->data[cpu]); } -static notrace void start_stack_trace(struct trace_array *tr) +static void start_stack_trace(struct trace_array *tr) { mutex_lock(&sample_timer_lock); stack_reset(tr); @@ -174,7 +174,7 @@ static notrace void start_stack_trace(struct trace_array *tr) mutex_unlock(&sample_timer_lock); } -static notrace void stop_stack_trace(struct trace_array *tr) +static void stop_stack_trace(struct trace_array *tr) { mutex_lock(&sample_timer_lock); stop_stack_timers(); @@ -182,7 +182,7 @@ static notrace void stop_stack_trace(struct trace_array *tr) mutex_unlock(&sample_timer_lock); } -static notrace void stack_trace_init(struct trace_array *tr) +static void stack_trace_init(struct trace_array *tr) { sysprof_trace = tr; @@ -190,7 +190,7 @@ static notrace void stack_trace_init(struct trace_array *tr) start_stack_trace(tr); } -static notrace void stack_trace_reset(struct trace_array *tr) +static void stack_trace_reset(struct trace_array *tr) { if (tr->ctrl) stop_stack_trace(tr); -- cgit v1.2.3 From 9caee613d3b860ae81b79370eeae9ac967c07536 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 23 May 2008 23:55:54 +0200 Subject: ftrace: fix __trace_special() Signed-off-by: Thomas Gleixner --- kernel/trace/trace_sysprof.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 3b1e4ba9180..76dd953eecc 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -81,7 +81,7 @@ static void timer_notify(struct pt_regs *regs, int cpu) } - trace_special(tr, data, 0, current->pid, regs->ip); + __trace_special(tr, data, 0, current->pid, regs->ip); fp = (void __user *)regs->bp; @@ -93,18 +93,18 @@ static void timer_notify(struct pt_regs *regs, int cpu) if ((unsigned long)fp < regs->sp) break; - trace_special(tr, data, 1, frame.return_address, + __trace_special(tr, data, 1, frame.return_address, (unsigned long)fp); fp = frame.next_fp; } - trace_special(tr, data, 2, current->pid, i); + __trace_special(tr, data, 2, current->pid, i); /* * Special trace entry if we overflow the max depth: */ if (i == sample_max_depth) - trace_special(tr, data, -1, -1, -1); + __trace_special(tr, data, -1, -1, -1); } static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) -- cgit v1.2.3 From 5fc4511c756860149b81aead6eca5bdf5c438ea7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 23 May 2008 23:58:21 +0200 Subject: ftrace: make it more 
available in the Kconfig Signed-off-by: Thomas Gleixner --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 9b49526ac0b..e101c9a85f0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -77,7 +77,7 @@ config PREEMPT_TRACER config SYSPROF_TRACER bool "Sysprof Tracer" - depends on DEBUG_KERNEL && X86 + depends on DEBUG_KERNEL select TRACING help This tracer provides the trace needed by the 'Sysprof' userspace -- cgit v1.2.3 From cd2134b1dda92fd450e6a1e12b1c7960dd6a2178 Mon Sep 17 00:00:00 2001 From: Soeren Sandmann Pedersen Date: Mon, 12 May 2008 21:20:54 +0200 Subject: sysprof: kernel trace add kernel backtracing to the sysprof tracer. change the format of the data, so that type=0 means beginning of stack trace, 1 means kernel address, 2 means user address, and 3 means end of trace. EIP addresses are no longer distinguished from return addresses, mostly because sysprof userspace doesn't make use of it. It may be worthwhile adding this back in though, just in case it becomes interesting. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace_sysprof.c | 89 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 80 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 76dd953eecc..ebcb66d054c 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -14,6 +14,8 @@ #include #include +#include + #include "trace.h" static struct trace_array *sysprof_trace; @@ -52,6 +54,77 @@ static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) return ret; } +struct backtrace_info { + struct trace_array_cpu *data; + struct trace_array *tr; + int pos; +}; + +static void +backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + /* Ignore warnings */ +} + +static void backtrace_warning(void *data, char *msg) +{ + /* Ignore warnings */ +} + +static int backtrace_stack(void *data, char *name) +{ + /* Don't bother with IRQ stacks for now */ + return -1; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ + struct backtrace_info *info = data; + + if (info->pos < sample_max_depth && reliable) { + __trace_special(info->tr, info->data, 1, addr, 0); + + info->pos++; + } +} + +const static struct stacktrace_ops backtrace_ops = { + .warning = backtrace_warning, + .warning_symbol = backtrace_warning_symbol, + .stack = backtrace_stack, + .address = backtrace_address, +}; + +static struct pt_regs * +trace_kernel(struct pt_regs *regs, struct trace_array *tr, + struct trace_array_cpu *data) +{ + struct backtrace_info info; + unsigned long bp; + char *user_stack; + char *stack; + + info.tr = tr; + info.data = data; + info.pos = 1; + + __trace_special(info.tr, info.data, 1, regs->ip, 0); + + stack = ((char *)regs + sizeof(struct pt_regs)); +#ifdef CONFIG_FRAME_POINTER + bp = regs->bp; +#else + bp = 0; +#endif + + dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info); + + /* Now trace the user stack */ + user_stack = ((char *)current->thread.sp0 - sizeof(struct pt_regs)); + + return (struct pt_regs *)user_stack; +} + static void timer_notify(struct pt_regs *regs, int cpu) { struct trace_array_cpu *data; @@ -74,17 +147,15 @@ static void timer_notify(struct pt_regs *regs, int cpu) if (is_user && current->state != TASK_RUNNING) return; - if (!is_user) { - /* kernel */ - ftrace(tr, data, current->pid, 1, 
0); - return; + __trace_special(tr, data, 0, 0, current->pid); - } - - __trace_special(tr, data, 0, current->pid, regs->ip); + if (!is_user) + regs = trace_kernel(regs, tr, data); fp = (void __user *)regs->bp; + __trace_special(tr, data, 2, regs->ip, 0); + for (i = 0; i < sample_max_depth; i++) { frame.next_fp = 0; frame.return_address = 0; @@ -93,12 +164,12 @@ static void timer_notify(struct pt_regs *regs, int cpu) if ((unsigned long)fp < regs->sp) break; - __trace_special(tr, data, 1, frame.return_address, + __trace_special(tr, data, 2, frame.return_address, (unsigned long)fp); fp = frame.next_fp; } - __trace_special(tr, data, 2, current->pid, i); + __trace_special(tr, data, 3, current->pid, i); /* * Special trace entry if we overflow the max depth: -- cgit v1.2.3 From 8a9e94c1fbfdac45a3b6811b880777c4116aa309 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:54 +0200 Subject: sysprof: update copyrights Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace_sysprof.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index ebcb66d054c..fe23d6dba7f 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -1,9 +1,9 @@ /* * trace stack traces * + * Copyright (C) 2004-2008, Soeren Sandmann * Copyright (C) 2007 Steven Rostedt * Copyright (C) 2008 Ingo Molnar - * Copyright (C) 2004, 2005, Soeren Sandmann */ #include #include -- cgit v1.2.3 From cf3271a73b612a03da00681ecd9bfefab37c74c9 Mon Sep 17 00:00:00 2001 From: Soeren Sandmann Date: Mon, 12 May 2008 05:28:50 +0200 Subject: ftrace/sysprof: don't trace the user stack if we are a kernel thread. Check that current->mm is non-NULL before attempting to trace the user stack. Also take depth of the kernel stack into account when comparing against sample_max_depth. 
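Two facts the hunk below relies on, distilled into a sketch (the helper is illustrative, not from the patch): kernel threads have no user address space, so current->mm is NULL for them and there is no user stack to sample; for tasks that do have one, the saved user-mode registers sit at the top of the kernel stack, just below thread.sp0 (x86 layout), which is where the patch now picks them up.

#include <linux/sched.h>

/* Return the saved user pt_regs for @task, or NULL for a kernel thread. */
static struct pt_regs *sysprof_user_regs(struct task_struct *task)
{
	if (!task->mm)
		return NULL;	/* kernel thread: nothing in user space to walk */
	return (struct pt_regs *)task->thread.sp0 - 1;
}
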
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace_sysprof.c | 50 +++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index fe23d6dba7f..2301e1e7c60 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -95,13 +95,12 @@ const static struct stacktrace_ops backtrace_ops = { .address = backtrace_address, }; -static struct pt_regs * +static int trace_kernel(struct pt_regs *regs, struct trace_array *tr, struct trace_array_cpu *data) { struct backtrace_info info; unsigned long bp; - char *user_stack; char *stack; info.tr = tr; @@ -119,10 +118,7 @@ trace_kernel(struct pt_regs *regs, struct trace_array *tr, dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info); - /* Now trace the user stack */ - user_stack = ((char *)current->thread.sp0 - sizeof(struct pt_regs)); - - return (struct pt_regs *)user_stack; + return info.pos; } static void timer_notify(struct pt_regs *regs, int cpu) @@ -150,32 +146,44 @@ static void timer_notify(struct pt_regs *regs, int cpu) __trace_special(tr, data, 0, 0, current->pid); if (!is_user) - regs = trace_kernel(regs, tr, data); + i = trace_kernel(regs, tr, data); + else + i = 0; - fp = (void __user *)regs->bp; + /* + * Trace user stack if we are not a kernel thread + */ + if (current->mm && i < sample_max_depth) { + regs = (struct pt_regs *)current->thread.sp0 - 1; - __trace_special(tr, data, 2, regs->ip, 0); + fp = (void __user *)regs->bp; - for (i = 0; i < sample_max_depth; i++) { - frame.next_fp = 0; - frame.return_address = 0; - if (!copy_stack_frame(fp, &frame)) - break; - if ((unsigned long)fp < regs->sp) - break; + __trace_special(tr, data, 2, regs->ip, 0); - __trace_special(tr, data, 2, frame.return_address, - (unsigned long)fp); - fp = frame.next_fp; - } + while (i < sample_max_depth) { + frame.next_fp = 0; + frame.return_address = 0; + if (!copy_stack_frame(fp, &frame)) + break; + if ((unsigned long)fp < regs->sp) + break; - __trace_special(tr, data, 3, current->pid, i); + __trace_special(tr, data, 2, frame.return_address, + (unsigned long)fp); + fp = frame.next_fp; + + i++; + } + + } /* * Special trace entry if we overflow the max depth: */ if (i == sample_max_depth) __trace_special(tr, data, -1, -1, -1); + + __trace_special(tr, data, 3, current->pid, i); } static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) -- cgit v1.2.3 From bfeeeeb991cf75081e6c2f74d44ae5da05b50a94 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 12 May 2008 21:21:14 +0200 Subject: stacktrace: don't crash on invalid stack trace structs This patch makes the stacktrace printout code \warn when the entries pointer is unset rather than crashing when trying to access it in an attempt to make it a bit more robust. I was saving a stacktrace into an skb and forgot to copy it across skb copies... I have since fixed the code, but it would have been easier had the kernel not crashed in an interrupt. 
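The failure mode described above comes from struct stack_trace not owning its storage: trace->entries only points at a buffer the caller provides, so a copied or re-used struct can easily end up with a stale or unset pointer. For reference, a typical correct usage looks like this (buffer size and skip value are arbitrary here):

#include <linux/kernel.h>
#include <linux/stacktrace.h>

static void dump_current_stack(void)
{
	unsigned long buf[16];
	struct stack_trace trace = {
		.max_entries	= ARRAY_SIZE(buf),
		.entries	= buf,		/* caller-owned storage */
		.skip		= 1,		/* drop this function's own frame */
	};

	save_stack_trace(&trace);
	print_stack_trace(&trace, 0);
}
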
Signed-off-by: Johannes Berg Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/stacktrace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index b71816e47a3..0914d0cbc83 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -13,6 +13,9 @@ void print_stack_trace(struct stack_trace *trace, int spaces) { int i, j; + if (WARN_ON(!trace->entries)) + return; + for (i = 0; i < trace->nr_entries; i++) { unsigned long ip = trace->entries[i]; -- cgit v1.2.3 From f984b51e0779a6dd30feedc41404013ca54e5d05 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: ftrace: add mmiotrace plugin On Sat, 22 Mar 2008 13:07:47 +0100 Ingo Molnar wrote: > > > i'd suggest the following: pull x86.git and sched-devel.git into a > > > single tree [the two will combine without rejects]. Then try to add a > > > kernel/tracing/trace_mmiotrace.c ftrace plugin. The trace_sysprof.c > > > plugin might be a good example. > > > > I did this and now I have mmiotrace enabled/disabled via the tracing > > framework (what do we call this, since ftrace is one of the tracers?). > > cool! could you send the patches for that? (even if they are not fully > functional yet) Patch attached in the end. Nice to see how much code disappeared. I tried to mark all the features I had to break with XXX-comments. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/Makefile | 1 + kernel/trace/trace_mmiotrace.c | 84 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 kernel/trace/trace_mmiotrace.c (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d9efbbfa2bd..c44a7dce908 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -18,5 +18,6 @@ obj-$(CONFIG_FTRACE) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c new file mode 100644 index 00000000000..e4dd03cc5aa --- /dev/null +++ b/kernel/trace/trace_mmiotrace.c @@ -0,0 +1,84 @@ +/* + * Memory mapped I/O tracing + * + * Copyright (C) 2008 Pekka Paalanen + */ + +#define DEBUG 1 + +#include +#include + +#include "trace.h" + +extern void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3); + +static struct trace_array *mmio_trace_array; + + +static void mmio_trace_init(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + mmio_trace_array = tr; + if (tr->ctrl) + enable_mmiotrace(); +} + +static void mmio_trace_reset(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + if (tr->ctrl) + disable_mmiotrace(); +} + +static void mmio_trace_ctrl_update(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + if (tr->ctrl) + enable_mmiotrace(); + else + disable_mmiotrace(); +} + +static struct tracer mmio_tracer __read_mostly = +{ + .name = "mmiotrace", + .init = mmio_trace_init, + .reset = mmio_trace_reset, + .ctrl_update = mmio_trace_ctrl_update, +}; + +__init static int init_mmio_trace(void) +{ + int ret = init_mmiotrace(); + if (ret) + return ret; + return register_tracer(&mmio_tracer); +} +device_initcall(init_mmio_trace); + +void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg) +{ + struct trace_array *tr = 
mmio_trace_array; + struct trace_array_cpu *data = tr->data[smp_processor_id()]; + + if (!current || current->pid == 0) { + /* + * XXX: This is a problem. We need to able to record, no + * matter what. tracing_generic_entry_update() would crash. + */ + static unsigned limit; + if (limit++ < 12) + pr_err("Error in %s: no current.\n", __func__); + return; + } + if (!tr || !data) { + static unsigned limit; + if (limit++ < 12) + pr_err("%s: no tr or data\n", __func__); + return; + } + __trace_special(tr, data, type, addr, arg); +} -- cgit v1.2.3 From bd8ac686c73c7e925fcfe0b02dc4e7b947127864 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: ftrace: mmiotrace, updates here is a patch that makes mmiotrace work almost well within the tracing framework. The patch applies on top of my previous patch. I have my own output formatting in place now. Summary of changes: - fix the NULL dereference that was due to not calling tracing_reset() - add print_line() callback into struct tracer - implement print_line() for mmiotrace, producing up-to-spec text - add my output header, but that is not really called in the right place - rewrote the main structs in mmiotrace - added two new trace entry types: TRACE_MMIO_RW and TRACE_MMIO_MAP - made some functions in trace.c non-static - check current==NULL in tracing_generic_entry_update() - fix(?) comparison in trace_seq_printf() Things seem to work fine except a few issues. Markers (text lines injected into mmiotrace log) are missing, I did not feel hacking them in before we have variable length entries. My output header is printed only for 'trace' file, but not 'trace_pipe'. For some reason, despite my quick fix, iter->trace is NULL in print_trace_line() when called from 'trace_pipe' file, which means I don't get proper output formatting. I only tried by loading nouveau.ko, which just detects the card, and that is traced fine. I didn't try further. Map, two reads and unmap. Works perfectly. I am missing the information about overflows, I'd prefer to have a counter for lost events. I didn't try, but I guess currently there is no way of knowning when it overflows? So, not too far from being fully operational, it seems :-) And looking at the diffstat, there also is some 700-900 lines of user space code that just became obsolete. 
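For orientation while reading these hunks, the control skeleton shared by the sysprof and mmiotrace plugins in this series boils down to the following sketch (names are illustrative, it assumes it is built next to kernel/trace/trace.h, and only the hooks used so far are shown); the open and print_line callbacks added by the patch below slot into this same struct:

#include <linux/init.h>
#include <linux/module.h>

#include "trace.h"

static struct trace_array *example_trace;

static void example_init(struct trace_array *tr)
{
	example_trace = tr;
	/* tr->ctrl tells the plugin whether tracing should start now */
}

static void example_reset(struct trace_array *tr)
{
	/* stop tracing and clean up per-CPU buffers here */
}

static void example_ctrl_update(struct trace_array *tr)
{
	/* user toggled tracing on or off */
}

static struct tracer example_tracer __read_mostly =
{
	.name		= "example",
	.init		= example_init,
	.reset		= example_reset,
	.ctrl_update	= example_ctrl_update,
};

__init static int init_example_tracer(void)
{
	return register_tracer(&example_tracer);
}
device_initcall(init_example_tracer);
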
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 34 ++++++++++ kernel/trace/trace.h | 14 ++++ kernel/trace/trace_mmiotrace.c | 151 +++++++++++++++++++++++++++++++++-------- 3 files changed, 171 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3271916ff03..d14fe49e963 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -831,6 +831,40 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags); } +#ifdef CONFIG_MMIOTRACE +void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, + struct mmiotrace_rw *rw) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + spin_lock_irqsave(&data->lock, irq_flags); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_MMIO_RW; + entry->mmiorw = *rw; + spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); +} + +void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, + struct mmiotrace_map *map) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + spin_lock_irqsave(&data->lock, irq_flags); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_MMIO_MAP; + entry->mmiomap = *map; + spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); +} +#endif + void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c460e85e94e..0ef9ef74c80 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -5,6 +5,7 @@ #include #include #include +#include enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -14,6 +15,8 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_SPECIAL, + TRACE_MMIO_RW, + TRACE_MMIO_MAP, __TRACE_LAST_TYPE }; @@ -75,6 +78,8 @@ struct trace_entry { struct ctx_switch_entry ctx; struct special_entry special; struct stack_entry stack; + struct mmiotrace_rw mmiorw; + struct mmiotrace_map mmiomap; }; }; @@ -255,6 +260,15 @@ extern unsigned long ftrace_update_tot_cnt; extern int DYN_FTRACE_TEST_NAME(void); #endif +#ifdef CONFIG_MMIOTRACE +extern void __trace_mmiotrace_rw(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_rw *rw); +extern void __trace_mmiotrace_map(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_map *map); +#endif + #ifdef CONFIG_FTRACE_STARTUP_TEST #ifdef CONFIG_FTRACE extern int trace_selftest_startup_function(struct tracer *trace, diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index e4dd03cc5aa..3a12b1ad0c6 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -11,19 +11,26 @@ #include "trace.h" -extern void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3); - static struct trace_array *mmio_trace_array; +static void mmio_reset_data(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} static void mmio_trace_init(struct trace_array *tr) { pr_debug("in %s\n", __func__); mmio_trace_array = tr; - if (tr->ctrl) + if (tr->ctrl) { + mmio_reset_data(tr); enable_mmiotrace(); + } } static void mmio_trace_reset(struct trace_array *tr) @@ -31,15 +38,110 @@ static void mmio_trace_reset(struct trace_array *tr) pr_debug("in %s\n", __func__); if 
(tr->ctrl) disable_mmiotrace(); + mmio_reset_data(tr); + mmio_trace_array = NULL; } static void mmio_trace_ctrl_update(struct trace_array *tr) { pr_debug("in %s\n", __func__); - if (tr->ctrl) + if (tr->ctrl) { + mmio_reset_data(tr); enable_mmiotrace(); - else + } else { disable_mmiotrace(); + } +} + +/* XXX: This is not called for trace_pipe file! */ +void mmio_print_header(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + trace_seq_printf(s, "VERSION broken 20070824\n"); + /* TODO: print /proc/bus/pci/devices contents as PCIDEV lines */ +} + +static int mmio_print_rw(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct mmiotrace_rw *rw = &entry->mmiorw; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(entry->t); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret = 1; + + switch (entry->mmiorw.opcode) { + case MMIO_READ: + ret = trace_seq_printf(s, + "R %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, rw->phys, + rw->value, rw->pc, entry->pid); + break; + case MMIO_WRITE: + ret = trace_seq_printf(s, + "W %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, rw->phys, + rw->value, rw->pc, entry->pid); + break; + case MMIO_UNKNOWN_OP: + ret = trace_seq_printf(s, + "UNKNOWN %lu.%06lu %d 0x%lx %02x,%02x,%02x 0x%lx %d\n", + secs, usec_rem, rw->map_id, rw->phys, + (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, + (rw->value >> 0) & 0xff, rw->pc, entry->pid); + break; + default: + ret = trace_seq_printf(s, "rw what?\n"); + break; + } + if (ret) + return 1; + return 0; +} + +static int mmio_print_map(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct mmiotrace_map *m = &entry->mmiomap; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(entry->t); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret = 1; + + switch (entry->mmiorw.opcode) { + case MMIO_PROBE: + ret = trace_seq_printf(s, + "MAP %lu.%06lu %d 0x%lx 0x%lx 0x%lx 0x%lx %d\n", + secs, usec_rem, m->map_id, m->phys, m->virt, m->len, + 0UL, entry->pid); + break; + case MMIO_UNPROBE: + ret = trace_seq_printf(s, + "UNMAP %lu.%06lu %d 0x%lx %d\n", + secs, usec_rem, m->map_id, 0UL, entry->pid); + break; + default: + ret = trace_seq_printf(s, "map what?\n"); + break; + } + if (ret) + return 1; + return 0; +} + +/* return 0 to abort printing without consuming current entry in pipe mode */ +static int mmio_print_line(struct trace_iterator *iter) +{ + switch (iter->ent->type) { + case TRACE_MMIO_RW: + return mmio_print_rw(iter); + case TRACE_MMIO_MAP: + return mmio_print_map(iter); + default: + return 1; /* ignore unknown entries */ + } } static struct tracer mmio_tracer __read_mostly = @@ -47,38 +149,31 @@ static struct tracer mmio_tracer __read_mostly = .name = "mmiotrace", .init = mmio_trace_init, .reset = mmio_trace_reset, + .open = mmio_print_header, .ctrl_update = mmio_trace_ctrl_update, + .print_line = mmio_print_line, }; __init static int init_mmio_trace(void) { - int ret = init_mmiotrace(); - if (ret) - return ret; return register_tracer(&mmio_tracer); } device_initcall(init_mmio_trace); -void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg) +void mmio_trace_rw(struct mmiotrace_rw *rw) { struct trace_array *tr = mmio_trace_array; struct trace_array_cpu *data = tr->data[smp_processor_id()]; + __trace_mmiotrace_rw(tr, data, rw); +} - if (!current || 
current->pid == 0) { - /* - * XXX: This is a problem. We need to able to record, no - * matter what. tracing_generic_entry_update() would crash. - */ - static unsigned limit; - if (limit++ < 12) - pr_err("Error in %s: no current.\n", __func__); - return; - } - if (!tr || !data) { - static unsigned limit; - if (limit++ < 12) - pr_err("%s: no tr or data\n", __func__); - return; - } - __trace_special(tr, data, type, addr, arg); +void mmio_trace_mapping(struct mmiotrace_map *map) +{ + struct trace_array *tr = mmio_trace_array; + struct trace_array_cpu *data; + + preempt_disable(); + data = tr->data[smp_processor_id()]; + __trace_mmiotrace_map(tr, data, map); + preempt_enable(); } -- cgit v1.2.3 From 138295373ccf7625fcb0218dfea114837983bc39 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:58 +0200 Subject: ftrace: mmiotrace update, #2 another weekend, another patch. This should apply on top of my previous patch from March 23rd. Summary of changes: - Print PCI device list in output header - work around recursive probe hits on SMP - refactor dis/arm_kmmio_fault_page() and add check for page levels - remove un/reference_kmmio(), the die notifier hook is registered permanently into the list - explicitly check for single stepping in die notifier callback I have tested this version on my UP Athlon64 desktop with Nouveau, and SMP Core 2 Duo laptop with the proprietary nvidia driver. Both systems are 64-bit. One previously unknown bug crept into daylight: the ftrace framework's output routines print the first entry last after buffer has wrapped around. The most important regressions compared to non-ftrace mmiotrace at this time are: - failure of trace_pipe file - illegal lines in output file - unaware of losing data due to buffer full Personally I'd like to see these three solved before submitting to mainline. Other issues may come up once we know when we lose events. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace_mmiotrace.c | 47 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 3a12b1ad0c6..361472b5788 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -8,6 +8,7 @@ #include #include +#include #include "trace.h" @@ -53,12 +54,52 @@ static void mmio_trace_ctrl_update(struct trace_array *tr) } } +static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) +{ + int ret = 0; + int i; + resource_size_t start, end; + const struct pci_driver *drv = pci_dev_driver(dev); + + /* XXX: incomplete checks for trace_seq_printf() return value */ + ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", + dev->bus->number, dev->devfn, + dev->vendor, dev->device, dev->irq); + /* + * XXX: is pci_resource_to_user() appropriate, since we are + * supposed to interpret the __ioremap() phys_addr argument based on + * these printed values? + */ + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + ret += trace_seq_printf(s, " %llx", + (unsigned long long)(start | + (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); + } + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + ret += trace_seq_printf(s, " %llx", + dev->resource[i].start < dev->resource[i].end ? 
+ (unsigned long long)(end - start) + 1 : 0); + } + if (drv) + ret += trace_seq_printf(s, " %s\n", drv->name); + else + ret += trace_seq_printf(s, " \n"); + return ret; +} + /* XXX: This is not called for trace_pipe file! */ -void mmio_print_header(struct trace_iterator *iter) +static void mmio_print_header(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; - trace_seq_printf(s, "VERSION broken 20070824\n"); - /* TODO: print /proc/bus/pci/devices contents as PCIDEV lines */ + struct pci_dev *dev = NULL; + + trace_seq_printf(s, "VERSION 20070824\n"); + + for_each_pci_dev(dev) + mmio_print_pcidev(s, dev); + /* XXX: return value? What if header is very long? */ } static int mmio_print_rw(struct trace_iterator *iter) -- cgit v1.2.3 From 801a175bf601f9a9d5e86e92dee9adeeb6625da8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:58 +0200 Subject: mmiotrace: ftrace fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d14fe49e963..4dcc4e85c5d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -838,12 +838,16 @@ void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, struct trace_entry *entry; unsigned long irq_flags; - spin_lock_irqsave(&data->lock, irq_flags); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, 0); entry->type = TRACE_MMIO_RW; entry->mmiorw = *rw; - spin_unlock_irqrestore(&data->lock, irq_flags); + + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); trace_wake_up(); } @@ -854,12 +858,16 @@ void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, struct trace_entry *entry; unsigned long irq_flags; - spin_lock_irqsave(&data->lock, irq_flags); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, 0); entry->type = TRACE_MMIO_MAP; entry->mmiomap = *map; - spin_unlock_irqrestore(&data->lock, irq_flags); + + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); trace_wake_up(); } -- cgit v1.2.3 From 736ca61fa81874b3fee205a593251b1869d0bcf1 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:59 +0200 Subject: x86 mmiotrace: Do not print bogus pid Non-zero pid indicates the MMIO access originated in user space. We do not catch that kind of accesses yet, so always print zero for now. 
Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/trace/trace_mmiotrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 361472b5788..79be4a18ec1 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -117,20 +117,20 @@ static int mmio_print_rw(struct trace_iterator *iter) ret = trace_seq_printf(s, "R %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", rw->width, secs, usec_rem, rw->map_id, rw->phys, - rw->value, rw->pc, entry->pid); + rw->value, rw->pc, 0); break; case MMIO_WRITE: ret = trace_seq_printf(s, "W %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", rw->width, secs, usec_rem, rw->map_id, rw->phys, - rw->value, rw->pc, entry->pid); + rw->value, rw->pc, 0); break; case MMIO_UNKNOWN_OP: ret = trace_seq_printf(s, "UNKNOWN %lu.%06lu %d 0x%lx %02x,%02x,%02x 0x%lx %d\n", secs, usec_rem, rw->map_id, rw->phys, (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, - (rw->value >> 0) & 0xff, rw->pc, entry->pid); + (rw->value >> 0) & 0xff, rw->pc, 0); break; default: ret = trace_seq_printf(s, "rw what?\n"); -- cgit v1.2.3 From d0a7e8ca5b996d36219e6fc002907291c8ee677b Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:02 +0200 Subject: mmiotrace: print header using the read hook. Now the header is printed only for `trace_pipe' file. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- kernel/trace/trace_mmiotrace.c | 60 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 79be4a18ec1..6d2edbdde93 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -12,6 +12,10 @@ #include "trace.h" +struct header_iter { + struct pci_dev *dev; +}; + static struct trace_array *mmio_trace_array; static void mmio_reset_data(struct trace_array *tr) @@ -89,17 +93,57 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) return ret; } -/* XXX: This is not called for trace_pipe file! */ -static void mmio_print_header(struct trace_iterator *iter) +static void destroy_header_iter(struct header_iter *hiter) +{ + if (!hiter) + return; + pci_dev_put(hiter->dev); + kfree(hiter); +} + +static void mmio_pipe_open(struct trace_iterator *iter) { + struct header_iter *hiter; struct trace_seq *s = &iter->seq; - struct pci_dev *dev = NULL; trace_seq_printf(s, "VERSION 20070824\n"); - for_each_pci_dev(dev) - mmio_print_pcidev(s, dev); - /* XXX: return value? What if header is very long? */ + hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); + if (!hiter) + return; + + hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL); + iter->private = hiter; +} + +/* XXX: This is not called when the pipe is closed! 
*/ +static void mmio_close(struct trace_iterator *iter) +{ + struct header_iter *hiter = iter->private; + destroy_header_iter(hiter); + iter->private = NULL; +} + +static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, + char __user *ubuf, size_t cnt, loff_t *ppos) +{ + ssize_t ret; + struct header_iter *hiter = iter->private; + struct trace_seq *s = &iter->seq; + + if (!hiter) + return 0; + + mmio_print_pcidev(s, hiter->dev); + hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev); + + if (!hiter->dev) { + destroy_header_iter(hiter); + iter->private = NULL; + } + + ret = trace_seq_to_user(s, ubuf, cnt); + return (ret == -EBUSY) ? 0 : ret; } static int mmio_print_rw(struct trace_iterator *iter) @@ -190,7 +234,9 @@ static struct tracer mmio_tracer __read_mostly = .name = "mmiotrace", .init = mmio_trace_init, .reset = mmio_trace_reset, - .open = mmio_print_header, + .pipe_open = mmio_pipe_open, + .close = mmio_close, + .read = mmio_read, .ctrl_update = mmio_trace_ctrl_update, .print_line = mmio_print_line, }; -- cgit v1.2.3 From 2039238b79b51a50f8477f53f33750e1c3fc146a Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:02 +0200 Subject: mmiotrace: print overrun counts Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- kernel/trace/trace_mmiotrace.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 6d2edbdde93..d0f649a2203 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -17,11 +17,13 @@ struct header_iter { }; static struct trace_array *mmio_trace_array; +static bool overrun_detected; static void mmio_reset_data(struct trace_array *tr) { int cpu; + overrun_detected = false; tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) @@ -124,12 +126,34 @@ static void mmio_close(struct trace_iterator *iter) iter->private = NULL; } +static unsigned long count_overruns(struct trace_iterator *iter) +{ + int cpu; + unsigned long cnt = 0; + for_each_online_cpu(cpu) { + cnt += iter->overrun[cpu]; + iter->overrun[cpu] = 0; + } + return cnt; +} + static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { ssize_t ret; struct header_iter *hiter = iter->private; struct trace_seq *s = &iter->seq; + unsigned long n; + + n = count_overruns(iter); + if (n) { + /* XXX: This is later than where events were lost. */ + trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n); + if (!overrun_detected) + pr_warning("mmiotrace has lost events.\n"); + overrun_detected = true; + goto print_out; + } if (!hiter) return 0; @@ -142,6 +166,7 @@ static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, iter->private = NULL; } +print_out: ret = trace_seq_to_user(s, ubuf, cnt); return (ret == -EBUSY) ? 
0 : ret; } -- cgit v1.2.3 From e0fd5c2fa188311667267c02a702ae699a9fc2bd Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:02 +0200 Subject: mmiotrace: do not print bogus pid for maps either Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- kernel/trace/trace_mmiotrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index d0f649a2203..3c1dacdc2d8 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -225,12 +225,12 @@ static int mmio_print_map(struct trace_iterator *iter) ret = trace_seq_printf(s, "MAP %lu.%06lu %d 0x%lx 0x%lx 0x%lx 0x%lx %d\n", secs, usec_rem, m->map_id, m->phys, m->virt, m->len, - 0UL, entry->pid); + 0UL, 0); break; case MMIO_UNPROBE: ret = trace_seq_printf(s, "UNMAP %lu.%06lu %d 0x%lx %d\n", - secs, usec_rem, m->map_id, 0UL, entry->pid); + secs, usec_rem, m->map_id, 0UL, 0); break; default: ret = trace_seq_printf(s, "map what?\n"); -- cgit v1.2.3 From dee310d0adf41019aca476052ac3085ff286d9be Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:03 +0200 Subject: x86 mmiotrace: use resource_size_t for phys addresses Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- kernel/trace/trace_mmiotrace.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 3c1dacdc2d8..b13dc19dcbb 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -184,20 +184,23 @@ static int mmio_print_rw(struct trace_iterator *iter) switch (entry->mmiorw.opcode) { case MMIO_READ: ret = trace_seq_printf(s, - "R %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", - rw->width, secs, usec_rem, rw->map_id, rw->phys, + "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_WRITE: ret = trace_seq_printf(s, - "W %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", - rw->width, secs, usec_rem, rw->map_id, rw->phys, + "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_UNKNOWN_OP: ret = trace_seq_printf(s, - "UNKNOWN %lu.%06lu %d 0x%lx %02x,%02x,%02x 0x%lx %d\n", - secs, usec_rem, rw->map_id, rw->phys, + "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n", + secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, (rw->value >> 0) & 0xff, rw->pc, 0); break; @@ -223,8 +226,9 @@ static int mmio_print_map(struct trace_iterator *iter) switch (entry->mmiorw.opcode) { case MMIO_PROBE: ret = trace_seq_printf(s, - "MAP %lu.%06lu %d 0x%lx 0x%lx 0x%lx 0x%lx %d\n", - secs, usec_rem, m->map_id, m->phys, m->virt, m->len, + "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", + secs, usec_rem, m->map_id, + (unsigned long long)m->phys, m->virt, m->len, 0UL, 0); break; case MMIO_UNPROBE: -- cgit v1.2.3 From 4d2df795f0c3eb91f97a666f47716121a2f166ed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 May 2008 15:00:46 +0200 Subject: sysprof: make it depend on X86 Signed-off-by: Thomas Gleixner --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e101c9a85f0..263e9e6bbd6 100644 --- a/kernel/trace/Kconfig +++ 
b/kernel/trace/Kconfig @@ -77,7 +77,7 @@ config PREEMPT_TRACER config SYSPROF_TRACER bool "Sysprof Tracer" - depends on DEBUG_KERNEL + depends on X86 select TRACING help This tracer provides the trace needed by the 'Sysprof' userspace -- cgit v1.2.3 From 81d50bb254ed53a0da45a65988e4e1fa08e8a541 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 15 May 2008 19:42:49 -0700 Subject: posix-timers: print RT watchdog message It's useful to detect which process is killed by RT watchdog. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/posix-cpu-timers.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index f1525ad06cb..c42a03aef36 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1037,6 +1037,9 @@ static void check_thread_timers(struct task_struct *tsk, sig->rlim[RLIMIT_RTTIME].rlim_cur += USEC_PER_SEC; } + printk(KERN_INFO + "RT Watchdog Timeout: %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); } } -- cgit v1.2.3 From 7f6f3a39d258adf51f0fb1fe0dab52272a1c61a4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 May 2008 23:12:18 +0200 Subject: namespacecheck: fix kernel printk.c Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 8fb01c32aa3..b620e3d9613 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -231,7 +231,7 @@ static inline void boot_delay_msec(void) /* * Return the number of unread characters in the log buffer. */ -int log_buf_get_len(void) +static int log_buf_get_len(void) { return logged_chars; } @@ -270,7 +270,7 @@ int log_buf_copy(char *dest, int idx, int len) /* * Extract a single character from the log buffer. */ -int log_buf_read(int idx) +static int log_buf_read(int idx) { char ret; -- cgit v1.2.3 From 42fdfa238a23643226910acf922ea930b3286032 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 May 2008 23:14:51 +0200 Subject: namespacecheck: more kernel/printk.c fixes [ Stephen Rothwell : build fix ] Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/printk.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index b620e3d9613..55d16e57499 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -267,19 +267,6 @@ int log_buf_copy(char *dest, int idx, int len) return ret; } -/* - * Extract a single character from the log buffer. - */ -static int log_buf_read(int idx) -{ - char ret; - - if (log_buf_copy(&ret, idx, 1) == 1) - return ret; - else - return -1; -} - /* * Commands to do_syslog: * -- cgit v1.2.3 From 3b8945e8d40645eecdb7d2357ca531f9b4dd9f71 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 12 May 2008 21:21:04 +0200 Subject: printk: clean up recursion check related static variables Make printk_recursion_bug_msg static and drop printk prefix from recursion variables. 
Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/printk.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 55d16e57499..8b42f87e311 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -652,16 +652,14 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu) spin_unlock(&logbuf_lock); return retval; } - -const char printk_recursion_bug_msg [] = - KERN_CRIT "BUG: recent printk recursion!\n"; -static int printk_recursion_bug; +static const char recursion_bug_msg [] = + KERN_CRIT "BUG: recent printk recursion!\n"; +static int recursion_bug; +static int log_level_unknown = 1; +static char printk_buf[1024]; asmlinkage int vprintk(const char *fmt, va_list args) { - static int log_level_unknown = 1; - static char printk_buf[1024]; - unsigned long flags; int printed_len = 0; int this_cpu; @@ -686,7 +684,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * it can be printed at the next appropriate moment: */ if (!oops_in_progress) { - printk_recursion_bug = 1; + recursion_bug = 1; goto out_restore_irqs; } zap_locks(); @@ -696,10 +694,10 @@ asmlinkage int vprintk(const char *fmt, va_list args) spin_lock(&logbuf_lock); printk_cpu = this_cpu; - if (printk_recursion_bug) { - printk_recursion_bug = 0; - strcpy(printk_buf, printk_recursion_bug_msg); - printed_len = sizeof(printk_recursion_bug_msg); + if (recursion_bug) { + recursion_bug = 0; + strcpy(printk_buf, recursion_bug_msg); + printed_len = sizeof(recursion_bug_msg); } /* Emit the output into the temporary buffer */ printed_len += vscnprintf(printk_buf + printed_len, -- cgit v1.2.3 From cd3a1b8562d28490b334a61d5eb05df3d722d91e Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 12 May 2008 21:21:04 +0200 Subject: printk: don't prefer unsuited consoles on registration console election: If some console happens to be registered first which does not provide a tty binding (!console->device), it prevents that more suited consoles which are registered later on can enter the candidate pool for console_device(). This is observable with KGDB's console which may already be registered (and exploited!) during early debugger connections, that is before any regular console registration. This patch fixes the issue by postponing the final, automated preferred_console selection until someone with a non-NULL device handler comes around. 
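[ Editor's note, not part of the patch: a minimal sketch of the ordering problem, using hypothetical console names and callbacks. A console that supplies no ->device hook (such as an early debugger console) cannot back /dev/console, yet before this change it could still claim CON_CONSDEV and preferred_console simply by registering first. ]

#include <linux/console.h>
#include <linux/tty_driver.h>

/* hypothetical low-level helpers, assumed to exist elsewhere */
static void dbg_write(struct console *con, const char *s, unsigned int n);
static void uart_write(struct console *con, const char *s, unsigned int n);
static struct tty_driver *uart_tty_device(struct console *con, int *index);

static struct console early_dbg_console = {
	.name	= "dbgcon",
	.write	= dbg_write,	/* no .device: cannot be bound to a tty */
	.flags	= CON_ENABLED,
	.index	= -1,
};

static struct console uart_console = {
	.name	= "ttyS",
	.write	= uart_write,
	.device	= uart_tty_device, /* tty binding: a real /dev/console candidate */
	.index	= -1,
};

/* With this patch, registering early_dbg_console first no longer sets
 * CON_CONSDEV and preferred_console; those are taken only once uart_console
 * (or another console providing ->device) registers. */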
Signed-off-by: Jan Kiszka Cc: Jason Wessel Cc: Gerd Hoffmann Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/printk.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 8b42f87e311..7d555615223 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1157,8 +1157,11 @@ void register_console(struct console *console) console->index = 0; if (console->setup == NULL || console->setup(console, NULL) == 0) { - console->flags |= CON_ENABLED | CON_CONSDEV; - preferred_console = 0; + console->flags |= CON_ENABLED; + if (console->device) { + console->flags |= CON_CONSDEV; + preferred_console = 0; + } } } -- cgit v1.2.3 From ac60ad7413ca8208094609a3b88ed9b1ed012fbc Mon Sep 17 00:00:00 2001 From: Nick Andrew Date: Mon, 12 May 2008 21:21:04 +0200 Subject: printk: refactor processing of line severity tokens Restructure the logic of vprintk() so the processing of the leading 3 characters of each input line is in one place, regardless whether printk_time is enabled. This makes the code smaller and easier to understand. size reduction in kernel/printk.o: text data bss dec hex filename 6157 397 1049804 1056358 101e66 printk.o.before 6117 397 1049804 1056318 101e3e printk.o.after and some style uncleanlinesses removed as well as a side-effect: Before: total: 19 errors, 22 warnings, 1340 lines checked After: total: 17 errors, 22 warnings, 1333 lines checked Signed-off-by: Nick Andrew Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/printk.c | 61 +++++++++++++++++++++++++-------------------------------- 1 file changed, 27 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 7d555615223..98ca1b76277 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -655,13 +655,13 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu) static const char recursion_bug_msg [] = KERN_CRIT "BUG: recent printk recursion!\n"; static int recursion_bug; -static int log_level_unknown = 1; + static int new_text_line = 1; static char printk_buf[1024]; asmlinkage int vprintk(const char *fmt, va_list args) { - unsigned long flags; int printed_len = 0; + unsigned long flags; int this_cpu; char *p; @@ -703,61 +703,54 @@ asmlinkage int vprintk(const char *fmt, va_list args) printed_len += vscnprintf(printk_buf + printed_len, sizeof(printk_buf) - printed_len, fmt, args); + /* * Copy the output into log_buf. If the caller didn't provide * appropriate log level tags, we insert them here */ for (p = printk_buf; *p; p++) { - if (log_level_unknown) { - /* log_level_unknown signals the start of a new line */ + if (new_text_line) { + int current_log_level = default_message_loglevel; + /* If a token, set current_log_level and skip over */ + if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' && + p[2] == '>') { + current_log_level = p[1] - '0'; + p += 3; + printed_len -= 3; + } + + /* Always output the token */ + emit_log_char('<'); + emit_log_char(current_log_level + '0'); + emit_log_char('>'); + printed_len += 3; + new_text_line = 0; + if (printk_time) { - int loglev_char; + /* Follow the token with the time */ char tbuf[50], *tp; unsigned tlen; unsigned long long t; unsigned long nanosec_rem; - /* - * force the log level token to be - * before the time output. 
- */ - if (p[0] == '<' && p[1] >='0' && - p[1] <= '7' && p[2] == '>') { - loglev_char = p[1]; - p += 3; - printed_len -= 3; - } else { - loglev_char = default_message_loglevel - + '0'; - } t = cpu_clock(printk_cpu); nanosec_rem = do_div(t, 1000000000); - tlen = sprintf(tbuf, - "<%c>[%5lu.%06lu] ", - loglev_char, - (unsigned long)t, - nanosec_rem/1000); + tlen = sprintf(tbuf, "[%5lu.%06lu] ", + (unsigned long) t, + nanosec_rem / 1000); for (tp = tbuf; tp < tbuf + tlen; tp++) emit_log_char(*tp); printed_len += tlen; - } else { - if (p[0] != '<' || p[1] < '0' || - p[1] > '7' || p[2] != '>') { - emit_log_char('<'); - emit_log_char(default_message_loglevel - + '0'); - emit_log_char('>'); - printed_len += 3; - } } - log_level_unknown = 0; + if (!*p) break; } + emit_log_char(*p); if (*p == '\n') - log_level_unknown = 1; + new_text_line = 1; } /* -- cgit v1.2.3 From 091593080533a752ce66d12858d8c4105c8e1793 Mon Sep 17 00:00:00 2001 From: Nick Andrew Date: Mon, 12 May 2008 21:21:04 +0200 Subject: printk: remember the message level for multi-line output printk(KERN_ALERT "Danger Will Robinson!\nAlien Approaching!\n"); At present this will result in one message at ALERT level and one at the current default message loglevel (e.g. WARNING). This is non-intuitive. Modify vprintk() to remember the message loglevel each time it is specified and use it for subsequent lines of output which do not specify one, within the same call to printk. Signed-off-by: Nick Andrew Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 98ca1b76277..475fc22a285 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -661,6 +661,7 @@ static char printk_buf[1024]; asmlinkage int vprintk(const char *fmt, va_list args) { int printed_len = 0; + int current_log_level = default_message_loglevel; unsigned long flags; int this_cpu; char *p; @@ -710,7 +711,6 @@ asmlinkage int vprintk(const char *fmt, va_list args) */ for (p = printk_buf; *p; p++) { if (new_text_line) { - int current_log_level = default_message_loglevel; /* If a token, set current_log_level and skip over */ if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' && p[2] == '>') { -- cgit v1.2.3 From 962cf36c5bf6d2840b8d66ee9a606fae2f540bbd Mon Sep 17 00:00:00 2001 From: "Carlos R. Mafra" Date: Thu, 15 May 2008 11:15:37 -0300 Subject: Remove argument from open_softirq which is always NULL As git-grep shows, open_softirq() is always called with the last argument being NULL block/blk-core.c: open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); kernel/hrtimer.c: open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL); kernel/rcuclassic.c: open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); kernel/rcupreempt.c: open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); kernel/sched.c: open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); kernel/softirq.c: open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); kernel/softirq.c: open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); kernel/timer.c: open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); net/core/dev.c: open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); net/core/dev.c: open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL); This observation has already been made by Matthew Wilcox in June 2002 (http://www.cs.helsinki.fi/linux/linux-kernel/2002-25/0687.html) "I notice that none of the current softirq routines use the data element passed to them." 
and the situation hasn't changed since them. So it appears we can safely remove that extra argument to save 128 (54) bytes of kernel data (text). Signed-off-by: Carlos R. Mafra Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 2 +- kernel/rcuclassic.c | 2 +- kernel/rcupreempt.c | 2 +- kernel/sched.c | 2 +- kernel/softirq.c | 7 +++---- kernel/timer.c | 2 +- 6 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 421be5fe5cc..861b4088092 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1669,7 +1669,7 @@ void __init hrtimers_init(void) (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); #ifdef CONFIG_HIGH_RES_TIMERS - open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL); + open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); #endif } diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index f4ffbd0f306..f6e01f3ae9c 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -529,7 +529,7 @@ static void __cpuinit rcu_online_cpu(int cpu) rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } static int __cpuinit rcu_cpu_notify(struct notifier_block *self, diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index e1cdf196a51..9dd827db359 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1125,7 +1125,7 @@ void __init __rcu_init(void) for_each_online_cpu(cpu) rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } /* diff --git a/kernel/sched.c b/kernel/sched.c index cfa222a9153..56ea3a203a5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8154,7 +8154,7 @@ void __init sched_init(void) #endif #ifdef CONFIG_SMP - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); #endif #ifdef CONFIG_RT_MUTEXES diff --git a/kernel/softirq.c b/kernel/softirq.c index 36e06174004..059256874e9 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -347,9 +347,8 @@ void raise_softirq(unsigned int nr) local_irq_restore(flags); } -void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) +void open_softirq(int nr, void (*action)(struct softirq_action *)) { - softirq_vec[nr].data = data; softirq_vec[nr].action = action; } @@ -503,8 +502,8 @@ void __init softirq_init(void) &per_cpu(tasklet_hi_vec, cpu).head; } - open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); - open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); + open_softirq(TASKLET_SOFTIRQ, tasklet_action); + open_softirq(HI_SOFTIRQ, tasklet_hi_action); } static int ksoftirqd(void * __bind_cpu) diff --git a/kernel/timer.c b/kernel/timer.c index ceacc662657..b4da888497f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1502,7 +1502,7 @@ void __init init_timers(void) BUG_ON(err == NOTIFY_BAD); register_cpu_notifier(&timers_nb); - open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); + open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } /** -- cgit v1.2.3 From c6531cce6e6e4b99bcda46b6268d6f2d9e30aea4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:14 +0200 Subject: sched: do not trace sched_clock The tracer uses sched_clock, so do not trace it. 
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e2e985eeee7..6590a828138 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -884,12 +884,12 @@ static unsigned long long __cpu_clock(int cpu) * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): */ -unsigned long long cpu_clock(int cpu) +unsigned long long notrace cpu_clock(int cpu) { unsigned long long prev_cpu_time, time, delta_time; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); prev_cpu_time = per_cpu(prev_cpu_time, cpu); time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); delta_time = time-prev_cpu_time; @@ -898,7 +898,7 @@ unsigned long long cpu_clock(int cpu) time = __sync_cpu_clock(time, cpu); per_cpu(prev_cpu_time, cpu) = time; } - local_irq_restore(flags); + raw_local_irq_restore(flags); return time; } -- cgit v1.2.3 From 19384c0314342222b18d4c7f09cdce1ca74dfd2a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 May 2008 00:22:16 -0400 Subject: ftrace: limit use of check pages The check_pages function is called often enough that it can cause problems with trace outputs or even bringing the system to a halt. This patch limits the check_pages to the places that are most likely to have problems. The check is made at the flip between the global array and the max save array, as well as when the size of the buffers changes and the self tests. This patch also removes the BUG_ON from check_pages and replaces it with a WARN_ON and disabling of the tracer. Signed-off-by: Steven Rostedt Cc: pq@iki.fi Cc: proski@gnu.org Cc: sandmann@redhat.com Cc: a.p.zijlstra@chello.nl Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 32 +++++++++++++++++++++++--------- kernel/trace/trace_selftest.c | 1 + 2 files changed, 24 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3271916ff03..0567f51bbea 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -249,24 +249,32 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_record_cmdline(current); } +#define CHECK_COND(cond) \ + if (unlikely(cond)) { \ + tracing_disabled = 1; \ + WARN_ON(1); \ + return -1; \ + } + /** * check_pages - integrity check of trace buffers * * As a safty measure we check to make sure the data pages have not - * been corrupted. TODO: configure to disable this because it adds - * a bit of overhead. + * been corrupted. 
*/ -void check_pages(struct trace_array_cpu *data) +int check_pages(struct trace_array_cpu *data) { struct page *page, *tmp; - BUG_ON(data->trace_pages.next->prev != &data->trace_pages); - BUG_ON(data->trace_pages.prev->next != &data->trace_pages); + CHECK_COND(data->trace_pages.next->prev != &data->trace_pages); + CHECK_COND(data->trace_pages.prev->next != &data->trace_pages); list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { - BUG_ON(page->lru.next->prev != &page->lru); - BUG_ON(page->lru.prev->next != &page->lru); + CHECK_COND(page->lru.next->prev != &page->lru); + CHECK_COND(page->lru.prev->next != &page->lru); } + + return 0; } /** @@ -280,7 +288,6 @@ void *head_page(struct trace_array_cpu *data) { struct page *page; - check_pages(data); if (list_empty(&data->trace_pages)) return NULL; @@ -2566,7 +2573,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, { unsigned long val; char buf[64]; - int ret; + int i, ret; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2635,8 +2642,15 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, trace_free_page(); } + /* check integrity */ + for_each_tracing_cpu(i) + check_pages(global_trace.data[i]); + filp->f_pos += cnt; + /* If check pages failed, return ENOMEM */ + if (tracing_disabled) + cnt = -ENOMEM; out: max_tr.entries = global_trace.entries; mutex_unlock(&trace_types_lock); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 3877dd9102f..18c5423bc97 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -28,6 +28,7 @@ trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) page = list_entry(data->trace_pages.next, struct page, lru); entries = page_address(page); + check_pages(data); if (head_page(data) != entries) goto failed; -- cgit v1.2.3 From 4902f8849da6d2805bd291551a6dfd48f1b4f604 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 May 2008 00:22:18 -0400 Subject: ftrace: move ftrace_special to trace.c Move the ftrace_special out of sched_switch to trace.c. 
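[ Editor's note, not part of the patch: ftrace_special() is an ad-hoc debugging hook that records three arbitrary values as a "special" entry in the trace buffer; moving it into trace.c ties it to the global trace state instead of the sched_switch tracer's, as the diff below shows. A hypothetical usage sketch: ]

#include <linux/jiffies.h>

/* prototype matching the definition added to kernel/trace/trace.c below */
extern void ftrace_special(unsigned long arg1, unsigned long arg2,
			   unsigned long arg3);

static void my_debug_hook(unsigned int irq)	/* hypothetical function */
{
	/* drop three values of interest into the trace for later inspection */
	ftrace_special(__LINE__, irq, jiffies);
}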
Signed-off-by: Steven Rostedt Cc: pq@iki.fi Cc: proski@gnu.org Cc: sandmann@redhat.com Cc: a.p.zijlstra@chello.nl Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 27 +++++++++++++++++++++++++-- kernel/trace/trace_sched_switch.c | 24 ------------------------ 2 files changed, 25 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0567f51bbea..583fe24903d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -941,6 +941,30 @@ tracing_sched_wakeup_trace(struct trace_array *tr, trace_wake_up(); } +void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + __trace_special(tr, data, arg1, arg2, arg3); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + #ifdef CONFIG_FTRACE static void function_trace_call(unsigned long ip, unsigned long parent_ip) @@ -2941,8 +2965,6 @@ __init static int tracer_alloc_buffers(void) int ret = -ENOMEM; int i; - global_trace.ctrl = tracer_enabled; - /* TODO: make the number of buffers hot pluggable with CPUS */ tracing_nr_buffers = num_possible_cpus(); tracing_buffer_mask = cpu_possible_map; @@ -3012,6 +3034,7 @@ __init static int tracer_alloc_buffers(void) current_trace = &no_tracer; /* All seems OK, enable tracing */ + global_trace.ctrl = tracer_enabled; tracing_disabled = 0; return 0; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index d25ffa5eaf2..798ec0dc863 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -125,30 +125,6 @@ wake_up_callback(void *probe_data, void *call_data, wakeup_func(probe_data, __rq, task, curr); } -void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - struct trace_array *tr = ctx_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - - if (!tracer_enabled) - return; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) - __trace_special(tr, data, arg1, arg2, arg3); - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - static void sched_switch_reset(struct trace_array *tr) { int cpu; -- cgit v1.2.3 From 7e18d8e701b6798a5df11e0a16881a60ab1018b6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 May 2008 00:22:19 -0400 Subject: ftrace: add function tracing to wake up tracing This patch adds function tracing to the functions that are called on the CPU of the task being traced. 
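[ Editor's note, not part of the patch: the mechanism used below is the generic way to hook a callback into every traced function call; a minimal sketch with hypothetical names, mirroring the trace_ops added in this patch: ]

#include <linux/cache.h>	/* __read_mostly */
#include <linux/ftrace.h>

static void my_func_hook(unsigned long ip, unsigned long parent_ip)
{
	/* runs on every traced function entry (ip = traced function,
	 * parent_ip = its caller); must guard against recursion and
	 * preemption, as wakeup_tracer_call below does */
}

static struct ftrace_ops my_ops __read_mostly =
{
	.func = my_func_hook,
};

/* enabled from the tracer's start path and torn down on stop: */
/*	register_ftrace_function(&my_ops);	*/
/*	unregister_ftrace_function(&my_ops);	*/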
Signed-off-by: Steven Rostedt Cc: pq@iki.fi Cc: proski@gnu.org Cc: sandmann@redhat.com Cc: a.p.zijlstra@chello.nl Signed-off-by: Thomas Gleixner --- kernel/trace/trace_sched_wakeup.c | 67 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 5d2fb48e47f..bf7e91caef5 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -30,6 +30,69 @@ static DEFINE_SPINLOCK(wakeup_lock); static void __wakeup_reset(struct trace_array *tr); +#ifdef CONFIG_FTRACE +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int resched; + int cpu; + + if (likely(!wakeup_task)) + return; + + resched = need_resched(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (unlikely(disabled != 1)) + goto out; + + spin_lock_irqsave(&wakeup_lock, flags); + + if (unlikely(!wakeup_task)) + goto unlock; + + /* + * The task can't disappear because it needs to + * wake up first, and we have the wakeup_lock. + */ + if (task_cpu(wakeup_task) != cpu) + goto unlock; + + trace_function(tr, data, ip, parent_ip, flags); + + unlock: + spin_unlock_irqrestore(&wakeup_lock, flags); + + out: + atomic_dec(&data->disabled); + + /* + * To prevent recursion from the scheduler, if the + * resched flag was set before we entered, then + * don't reschedule. + */ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = wakeup_tracer_call, +}; +#endif /* CONFIG_FTRACE */ + /* * Should this new latency be reported/recorded? */ @@ -73,7 +136,7 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, if (next != wakeup_task) return; - /* The task we are waitng for is waking up */ + /* The task we are waiting for is waking up */ data = tr->data[wakeup_cpu]; /* disable local data, not wakeup_cpu data */ @@ -290,6 +353,7 @@ static void start_wakeup_tracer(struct trace_array *tr) smp_wmb(); tracer_enabled = 1; + register_ftrace_function(&trace_ops); return; fail_deprobe_wake_new: @@ -305,6 +369,7 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; + unregister_ftrace_function(&trace_ops); marker_probe_unregister("kernel_sched_schedule", sched_switch_callback, &wakeup_trace); -- cgit v1.2.3 From da89a7a2536c46e76a1a4351a70a8b8417e5fed1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 May 2008 00:22:20 -0400 Subject: ftrace: remove printks from irqsoff trace Printing out new max latencies was fine for the old RT tracer. But for mainline it is a bit messy. We also need to test if the run queue is locked before we can do the print. This means that we may not be printing out latencies if the run queue is locked on another CPU. This produces inconsistencies in the output. This patch simply removes the print altogether. 
Signed-off-by: Steven Rostedt Cc: pq@iki.fi Cc: proski@gnu.org Cc: sandmann@redhat.com Cc: a.p.zijlstra@chello.nl Signed-off-by: Thomas Gleixner --- kernel/trace/trace_irqsoff.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 761f3ec66c5..421d6fe3650 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -165,22 +165,6 @@ check_critical_timing(struct trace_array *tr, update_max_tr_single(tr, current, cpu); - if (!runqueue_is_locked()) { - if (tracing_thresh) { - printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical" - " section violates %lu us threshold.\n", - current->comm, current->pid, - raw_smp_processor_id(), - latency, nsecs_to_usecs(tracing_thresh)); - } else { - printk(KERN_INFO "(%16s-%-5d|#%d): new %lu us" - " maximum-latency critical section.\n", - current->comm, current->pid, - raw_smp_processor_id(), - latency); - } - } - max_sequence++; out_unlock: -- cgit v1.2.3 From 41c52c0db9607e59f90da7da5309489fa06e887f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 May 2008 11:46:33 -0400 Subject: ftrace: set_ftrace_notrace feature While debugging latencies in the RT kernel, I found that it would be nice to be able to filter away functions from the trace than just to filter on functions. I added a new interface to the debugfs tracing directory called set_ftrace_notrace When dynamic frace is enabled, this lets you filter away functions that will not be recorded in the trace. It is similar to adding 'notrace' to those functions but by doing it without recompiling the kernel. Here's how set_ftrace_filter and set_ftrace_notrace interact. Remember, if set_ftrace_filter is set, it removes all functions from the trace execpt for those listed in the set_ftrace_filter. set_ftrace_notrace will prevent those functions from being traced. If you were to set one function in both set_ftrace_filter and set_ftrace_notrace and that function was the same, then you would end up with an empty trace. the set of functions to trace is: set_ftrace_filter == empty then all functions not in set_ftrace_notrace else set of the set_ftrace_filter and not in set of set_ftrace_notrace. Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 170 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 130 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 89bd9a6f52e..2552454609c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -170,7 +170,7 @@ static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); static DEFINE_SPINLOCK(ftrace_shutdown_lock); static DEFINE_MUTEX(ftraced_lock); -static DEFINE_MUTEX(ftrace_filter_lock); +static DEFINE_MUTEX(ftrace_regex_lock); struct ftrace_page { struct ftrace_page *next; @@ -337,13 +337,12 @@ static void __ftrace_replace_code(struct dyn_ftrace *rec, unsigned char *old, unsigned char *new, int enable) { - unsigned long ip; + unsigned long ip, fl; int failed; ip = rec->ip; if (ftrace_filtered && enable) { - unsigned long fl; /* * If filtering is on: * @@ -356,13 +355,16 @@ __ftrace_replace_code(struct dyn_ftrace *rec, * If this record is not set to be filtered * and it is not enabled do nothing. * + * If this record is set not to trace then + * do nothing. + * * If this record is not set to be filtered and * it is enabled, disable it. 
*/ fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || - (fl == 0)) + (fl == 0) || (rec->flags & FTRACE_FL_NOTRACE)) return; /* @@ -380,9 +382,17 @@ __ftrace_replace_code(struct dyn_ftrace *rec, } } else { - if (enable) + if (enable) { + /* + * If this record is set not to trace and is + * not enabled, do nothing. + */ + fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED); + if (fl == FTRACE_FL_NOTRACE) + return; + new = ftrace_call_replace(ip, FTRACE_ADDR); - else + } else old = ftrace_call_replace(ip, FTRACE_ADDR); if (enable) { @@ -721,6 +731,7 @@ static int __init ftrace_dyn_table_alloc(void) enum { FTRACE_ITER_FILTER = (1 << 0), FTRACE_ITER_CONT = (1 << 1), + FTRACE_ITER_NOTRACE = (1 << 2), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -754,7 +765,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) rec = &iter->pg->records[iter->idx++]; if ((rec->flags & FTRACE_FL_FAILED) || ((iter->flags & FTRACE_ITER_FILTER) && - !(rec->flags & FTRACE_FL_FILTER))) { + !(rec->flags & FTRACE_FL_FILTER)) || + ((iter->flags & FTRACE_ITER_NOTRACE) && + !(rec->flags & FTRACE_FL_NOTRACE))) { rec = NULL; goto retry; } @@ -847,22 +860,24 @@ int ftrace_avail_release(struct inode *inode, struct file *file) return 0; } -static void ftrace_filter_reset(void) +static void ftrace_filter_reset(int enable) { struct ftrace_page *pg; struct dyn_ftrace *rec; + unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; unsigned i; /* keep kstop machine from running */ preempt_disable(); - ftrace_filtered = 0; + if (enable) + ftrace_filtered = 0; pg = ftrace_pages_start; while (pg) { for (i = 0; i < pg->index; i++) { rec = &pg->records[i]; if (rec->flags & FTRACE_FL_FAILED) continue; - rec->flags &= ~FTRACE_FL_FILTER; + rec->flags &= ~type; } pg = pg->next; } @@ -870,7 +885,7 @@ static void ftrace_filter_reset(void) } static int -ftrace_filter_open(struct inode *inode, struct file *file) +ftrace_regex_open(struct inode *inode, struct file *file, int enable) { struct ftrace_iterator *iter; int ret = 0; @@ -882,15 +897,16 @@ ftrace_filter_open(struct inode *inode, struct file *file) if (!iter) return -ENOMEM; - mutex_lock(&ftrace_filter_lock); + mutex_lock(&ftrace_regex_lock); if ((file->f_mode & FMODE_WRITE) && !(file->f_flags & O_APPEND)) - ftrace_filter_reset(); + ftrace_filter_reset(enable); if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; iter->pos = -1; - iter->flags = FTRACE_ITER_FILTER; + iter->flags = enable ? 
FTRACE_ITER_FILTER : + FTRACE_ITER_NOTRACE; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -900,13 +916,25 @@ ftrace_filter_open(struct inode *inode, struct file *file) kfree(iter); } else file->private_data = iter; - mutex_unlock(&ftrace_filter_lock); + mutex_unlock(&ftrace_regex_lock); return ret; } +static int +ftrace_filter_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(inode, file, 1); +} + +static int +ftrace_notrace_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(inode, file, 0); +} + static ssize_t -ftrace_filter_read(struct file *file, char __user *ubuf, +ftrace_regex_read(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) { if (file->f_mode & FMODE_READ) @@ -916,7 +944,7 @@ ftrace_filter_read(struct file *file, char __user *ubuf, } static loff_t -ftrace_filter_lseek(struct file *file, loff_t offset, int origin) +ftrace_regex_lseek(struct file *file, loff_t offset, int origin) { loff_t ret; @@ -936,13 +964,14 @@ enum { }; static void -ftrace_match(unsigned char *buff, int len) +ftrace_match(unsigned char *buff, int len, int enable) { char str[KSYM_SYMBOL_LEN]; char *search = NULL; struct ftrace_page *pg; struct dyn_ftrace *rec; int type = MATCH_FULL; + unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; unsigned i, match = 0, search_len = 0; for (i = 0; i < len; i++) { @@ -966,7 +995,8 @@ ftrace_match(unsigned char *buff, int len) /* keep kstop machine from running */ preempt_disable(); - ftrace_filtered = 1; + if (enable) + ftrace_filtered = 1; pg = ftrace_pages_start; while (pg) { for (i = 0; i < pg->index; i++) { @@ -997,7 +1027,7 @@ ftrace_match(unsigned char *buff, int len) break; } if (matched) - rec->flags |= FTRACE_FL_FILTER; + rec->flags |= flag; } pg = pg->next; } @@ -1005,8 +1035,8 @@ ftrace_match(unsigned char *buff, int len) } static ssize_t -ftrace_filter_write(struct file *file, const char __user *ubuf, - size_t cnt, loff_t *ppos) +ftrace_regex_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos, int enable) { struct ftrace_iterator *iter; char ch; @@ -1016,7 +1046,7 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, if (!cnt || cnt < 0) return 0; - mutex_lock(&ftrace_filter_lock); + mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { struct seq_file *m = file->private_data; @@ -1045,7 +1075,6 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, cnt--; } - if (isspace(ch)) { file->f_pos += read; ret = read; @@ -1072,7 +1101,7 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, if (isspace(ch)) { iter->filtered++; iter->buffer[iter->buffer_idx] = 0; - ftrace_match(iter->buffer, iter->buffer_idx); + ftrace_match(iter->buffer, iter->buffer_idx, enable); iter->buffer_idx = 0; } else iter->flags |= FTRACE_ITER_CONT; @@ -1082,11 +1111,39 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, ret = read; out: - mutex_unlock(&ftrace_filter_lock); + mutex_unlock(&ftrace_regex_lock); return ret; } +static ssize_t +ftrace_filter_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 1); +} + +static ssize_t +ftrace_notrace_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 0); +} + +static void +ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) +{ + if (unlikely(ftrace_disabled)) + return; + + 
mutex_lock(&ftrace_regex_lock); + if (reset) + ftrace_filter_reset(enable); + if (buf) + ftrace_match(buf, len, enable); + mutex_unlock(&ftrace_regex_lock); +} + /** * ftrace_set_filter - set a function to filter on in ftrace * @buf - the string that holds the function filter text. @@ -1098,24 +1155,31 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, */ void ftrace_set_filter(unsigned char *buf, int len, int reset) { - if (unlikely(ftrace_disabled)) - return; + ftrace_set_regex(buf, len, reset, 1); +} - mutex_lock(&ftrace_filter_lock); - if (reset) - ftrace_filter_reset(); - if (buf) - ftrace_match(buf, len); - mutex_unlock(&ftrace_filter_lock); +/** + * ftrace_set_notrace - set a function to not trace in ftrace + * @buf - the string that holds the function notrace text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Notrace Filters denote which functions should not be enabled when tracing + * is enabled. If @buf is NULL and reset is set, all functions will be enabled + * for tracing. + */ +void ftrace_set_notrace(unsigned char *buf, int len, int reset) +{ + ftrace_set_regex(buf, len, reset, 0); } static int -ftrace_filter_release(struct inode *inode, struct file *file) +ftrace_regex_release(struct inode *inode, struct file *file, int enable) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; - mutex_lock(&ftrace_filter_lock); + mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { iter = m->private; @@ -1126,7 +1190,7 @@ ftrace_filter_release(struct inode *inode, struct file *file) if (iter->buffer_idx) { iter->filtered++; iter->buffer[iter->buffer_idx] = 0; - ftrace_match(iter->buffer, iter->buffer_idx); + ftrace_match(iter->buffer, iter->buffer_idx, enable); } mutex_lock(&ftrace_sysctl_lock); @@ -1137,10 +1201,22 @@ ftrace_filter_release(struct inode *inode, struct file *file) mutex_unlock(&ftrace_sysctl_lock); kfree(iter); - mutex_unlock(&ftrace_filter_lock); + mutex_unlock(&ftrace_regex_lock); return 0; } +static int +ftrace_filter_release(struct inode *inode, struct file *file) +{ + return ftrace_regex_release(inode, file, 1); +} + +static int +ftrace_notrace_release(struct inode *inode, struct file *file) +{ + return ftrace_regex_release(inode, file, 0); +} + static struct file_operations ftrace_avail_fops = { .open = ftrace_avail_open, .read = seq_read, @@ -1150,12 +1226,20 @@ static struct file_operations ftrace_avail_fops = { static struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, - .read = ftrace_filter_read, + .read = ftrace_regex_read, .write = ftrace_filter_write, - .llseek = ftrace_filter_lseek, + .llseek = ftrace_regex_lseek, .release = ftrace_filter_release, }; +static struct file_operations ftrace_notrace_fops = { + .open = ftrace_notrace_open, + .read = ftrace_regex_read, + .write = ftrace_notrace_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_notrace_release, +}; + /** * ftrace_force_update - force an update to all recording ftrace functions * @@ -1239,6 +1323,12 @@ static __init int ftrace_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs " "'set_ftrace_filter' entry\n"); + + entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer, + NULL, &ftrace_notrace_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_ftrace_notrace' entry\n"); return 0; } -- cgit v1.2.3 From 41bc8144d02028133bcd1d545023c6f49e8b2411 Mon Sep 17 00:00:00 2001 From: Steven Rostedt 
Date: Thu, 22 May 2008 11:49:22 -0400 Subject: ftrace: fix up cmdline recording The new work with converting the trace hooks over to markers broke the command line recording of ftrace. This patch fixes it again. Signed-off-by: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 3 --- kernel/trace/trace.h | 4 ++-- kernel/trace/trace_functions.c | 4 ++-- kernel/trace/trace_sched_switch.c | 21 +++++++++++++++------ 4 files changed, 19 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 583fe24903d..0feae23d989 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -652,9 +652,6 @@ static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; static int cmdline_idx; static DEFINE_SPINLOCK(trace_cmdline_lock); -/* trace in all context switches */ -atomic_t trace_record_cmdline_enabled __read_mostly; - /* temporary disable recording */ atomic_t trace_record_cmdline_disabled __read_mostly; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c460e85e94e..6b8bd8800d0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -218,6 +218,8 @@ void trace_function(struct trace_array *tr, void tracing_start_function_trace(void); void tracing_stop_function_trace(void); +void tracing_start_cmdline_record(void); +void tracing_stop_cmdline_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); @@ -226,8 +228,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_max_latency; extern unsigned long tracing_thresh; -extern atomic_t trace_record_cmdline_enabled; - void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 0a084656d7c..7ee7dcd76b7 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -29,14 +29,14 @@ static void function_reset(struct trace_array *tr) static void start_function_trace(struct trace_array *tr) { function_reset(tr); - atomic_inc(&trace_record_cmdline_enabled); + tracing_start_cmdline_record(); tracing_start_function_trace(); } static void stop_function_trace(struct trace_array *tr) { tracing_stop_function_trace(); - atomic_dec(&trace_record_cmdline_enabled); + tracing_stop_cmdline_record(); } static void function_trace_init(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 798ec0dc863..c16935d3bc5 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -29,6 +29,9 @@ sched_switch_func(void *private, void *__rq, struct task_struct *prev, long disabled; int cpu; + tracing_record_cmdline(prev); + tracing_record_cmdline(next); + if (!tracer_enabled) return; @@ -63,8 +66,6 @@ sched_switch_callback(void *probe_data, void *call_data, prev = va_arg(*args, typeof(prev)); next = va_arg(*args, typeof(next)); - tracing_record_cmdline(prev); - /* * If tracer_switch_func only points to the local * switch func, it still needs the ptr passed to it. 
@@ -213,18 +214,26 @@ void tracing_stop_sched_switch(void) tracing_sched_unregister(); } +void tracing_start_cmdline_record(void) +{ + tracing_start_sched_switch(); +} + +void tracing_stop_cmdline_record(void) +{ + tracing_stop_sched_switch(); +} + static void start_sched_trace(struct trace_array *tr) { sched_switch_reset(tr); - atomic_inc(&trace_record_cmdline_enabled); tracer_enabled = 1; - tracing_start_sched_switch(); + tracing_start_cmdline_record(); } static void stop_sched_trace(struct trace_array *tr) { - tracing_stop_sched_switch(); - atomic_dec(&trace_record_cmdline_enabled); + tracing_stop_cmdline_record(); tracer_enabled = 0; } -- cgit v1.2.3 From ffdaa3582b6b39d625d585d07e329ffdc925e971 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 24 May 2008 23:45:02 +0530 Subject: ftrace: safe traversal of ftrace_hash hlist Hi Steven, I noticed that concurrent instances of ftrace_record_ip() have a race between ftrace_hash list traversal during ftrace_ip_in_hash() (before acquiring ftrace_shutdown_lock) and ftrace_add_hash(). If it's so then this should fix it. Signed-off-by: Abhishek Sagar Cc: rostedt@goodmis.org Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2552454609c..9b7c54f8a62 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -201,7 +201,7 @@ ftrace_ip_in_hash(unsigned long ip, unsigned long key) struct hlist_node *t; int found = 0; - hlist_for_each_entry(p, t, &ftrace_hash[key], node) { + hlist_for_each_entry_rcu(p, t, &ftrace_hash[key], node) { if (p->ip == ip) { found = 1; break; @@ -214,7 +214,7 @@ ftrace_ip_in_hash(unsigned long ip, unsigned long key) static inline void ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) { - hlist_add_head(&node->node, &ftrace_hash[key]); + hlist_add_head_rcu(&node->node, &ftrace_hash[key]); } static void ftrace_free_rec(struct dyn_ftrace *rec) -- cgit v1.2.3 From 492a7ea5bcf263ee02a9eb6a3ab0222a1946fade Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sun, 25 May 2008 00:10:04 +0530 Subject: ftrace: fix updating of ftrace_update_cnt Hi Ingo/Steven, Ftrace currently maintains an update count which includes false updates, i.e, updates which failed. If anything, such failures should be tracked by some separate variable, but this patch provides a minimal fix. 
Signed-off-by: Abhishek Sagar Cc: rostedt@goodmis.org Signed-off-by: Thomas Gleixner --- kernel/trace/ftrace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9b7c54f8a62..1843edc098a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -453,7 +453,7 @@ static void ftrace_shutdown_replenish(void) ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); } -static void +static int ftrace_code_disable(struct dyn_ftrace *rec) { unsigned long ip; @@ -469,7 +469,9 @@ ftrace_code_disable(struct dyn_ftrace *rec) if (failed) { rec->flags |= FTRACE_FL_FAILED; ftrace_free_rec(rec); + return 0; } + return 1; } static int __ftrace_modify_code(void *data) @@ -617,8 +619,8 @@ static int __ftrace_update_code(void *ignore) /* all CPUS are stopped, we are safe to modify code */ hlist_for_each_entry(p, t, &head, node) { - ftrace_code_disable(p); - ftrace_update_cnt++; + if (ftrace_code_disable(p)) + ftrace_update_cnt++; } } -- cgit v1.2.3 From d031476408ae0f5196e3c579f519dfdefb099b67 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 23 May 2008 14:41:19 +0100 Subject: hrtimer: remove warning in hres_timers_resume hres_timers_resume() warns if there appears to be more than one cpu online. This warning makes sense when the suspend/resume mechanism offlines all cpus but one during the suspend/resume process. However, Xen suspend does not need to offline the other cpus; it merely keeps them tied up in stop_machine() while the virtual machine is suspended. The warning hres_timers_resume issues is therefore spurious. Signed-off-by: Jeremy Fitzhardinge Cc: xen-devel Cc: "Rafael J. Wysocki" Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 421be5fe5cc..493c4b8b8cb 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -632,8 +632,6 @@ void clock_was_set(void) */ void hres_timers_resume(void) { - WARN_ON_ONCE(num_online_cpus() > 1); - /* Retrigger the CPU local events: */ retrigger_next_event(NULL); } -- cgit v1.2.3 From 900cfa46191a7d87cf1891924cb90499287fd235 Mon Sep 17 00:00:00 2001 From: "Carlos R. Mafra" Date: Thu, 22 May 2008 19:25:11 -0300 Subject: hrtimer: Remove unused variables in ktime_divns() The variables dns and inc are not used, remove them. Signed-off-by: Carlos R. Mafra Cc: tglx@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 493c4b8b8cb..635739d219d 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -300,11 +300,10 @@ EXPORT_SYMBOL_GPL(ktime_sub_ns); */ u64 ktime_divns(const ktime_t kt, s64 div) { - u64 dclc, inc, dns; + u64 dclc; int sft = 0; - dclc = dns = ktime_to_ns(kt); - inc = div; + dclc = ktime_to_ns(kt); /* Make sure the divisor is less than 2^32: */ while (div >> 32) { sft++; -- cgit v1.2.3 From 9e124fe16ff24746d6de5a2ad685266d7bce0e08 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Mon, 26 May 2008 23:31:07 +0100 Subject: xen: Enable console tty by default in domU if it's not a dummy Without console= arguments on the kernel command line, the first console to register becomes enabled and the preferred console (the one behind /dev/console). This is normally tty (assuming CONFIG_VT_CONSOLE is enabled, which it commonly is). This is okay as long tty is a useful console. 
But unless we have the PV framebuffer, and it is enabled for this domain, tty0 in domU is merely a dummy. In that case, we want the preferred console to be the Xen console hvc0, and we want it without having to fiddle with the kernel command line. Commit b8c2d3dfbc117dff26058fbac316b8acfc2cb5f7 did that for us. Since we now have the PV framebuffer, we want to enable and prefer tty again, but only when PVFB is enabled. But even then we still want to enable the Xen console as well. Problem: when tty registers, we can't yet know whether the PVFB is enabled. By the time we can know (xenstore is up), the console setup game is over. Solution: enable console tty by default, but keep hvc as the preferred console. Change the preferred console to tty when PVFB probes successfully, unless we've been given console kernel parameters. Signed-off-by: Markus Armbruster Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- kernel/printk.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 8fb01c32aa3..028ed75d486 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -121,6 +121,8 @@ struct console_cmdline static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int selected_console = -1; static int preferred_console = -1; +int console_set_on_cmdline; +EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; @@ -890,6 +892,7 @@ static int __init console_setup(char *str) *s = 0; __add_preferred_console(buf, idx, options, brl_options); + console_set_on_cmdline = 1; return 1; } __setup("console=", console_setup); -- cgit v1.2.3 From 827e609b4581282b98bdf7666f6e93ff1bd1a63e Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 28 May 2008 12:49:56 -0500 Subject: kgdb: use common ascii helpers and put_unaligned_be32 helper Signed-off-by: Harvey Harrison Signed-off-by: Andrew Morton Signed-off-by: Jason Wessel --- kernel/kgdb.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 14787de568b..79e3c90113c 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -52,6 +52,7 @@ #include #include #include +#include static int kgdb_break_asap; @@ -227,8 +228,6 @@ void __weak kgdb_disable_hw_debug(struct pt_regs *regs) * GDB remote protocol parser: */ -static const char hexchars[] = "0123456789abcdef"; - static int hex(char ch) { if ((ch >= 'a') && (ch <= 'f')) @@ -316,8 +315,8 @@ static void put_packet(char *buffer) } kgdb_io_ops->write_char('#'); - kgdb_io_ops->write_char(hexchars[checksum >> 4]); - kgdb_io_ops->write_char(hexchars[checksum & 0xf]); + kgdb_io_ops->write_char(hex_asc_hi(checksum)); + kgdb_io_ops->write_char(hex_asc_lo(checksum)); if (kgdb_io_ops->flush) kgdb_io_ops->flush(); @@ -478,8 +477,8 @@ static void error_packet(char *pkt, int error) { error = -error; pkt[0] = 'E'; - pkt[1] = hexchars[(error / 10)]; - pkt[2] = hexchars[(error % 10)]; + pkt[1] = hex_asc[(error / 10)]; + pkt[2] = hex_asc[(error % 10)]; pkt[3] = '\0'; } @@ -510,10 +509,7 @@ static void int_to_threadref(unsigned char *id, int value) scan = (unsigned char *)id; while (i--) *scan++ = 0; - *scan++ = (value >> 24) & 0xff; - *scan++ = (value >> 16) & 0xff; - *scan++ = (value >> 8) & 0xff; - *scan++ = (value & 0xff); + put_unaligned_be32(value, scan); } static struct task_struct *getthread(struct pt_regs *regs, int tid) -- cgit v1.2.3 From ca05a99a54db1db5bca72eccb5866d2a86f8517f Mon Sep 17 
00:00:00 2001 From: "Andrew G. Morgan" Date: Tue, 27 May 2008 22:05:17 -0700 Subject: capabilities: remain source compatible with 32-bit raw legacy capability support. Source code out there hard-codes a notion of what the _LINUX_CAPABILITY_VERSION #define means in terms of the semantics of the raw capability system calls capget() and capset(). Its unfortunate, but true. Since the confusing header file has been in a released kernel, there is software that is erroneously using 64-bit capabilities with the semantics of 32-bit compatibilities. These recently compiled programs may suffer corruption of their memory when sys_getcap() overwrites more memory than they are coded to expect, and the raising of added capabilities when using sys_capset(). As such, this patch does a number of things to clean up the situation for all. It 1. forces the _LINUX_CAPABILITY_VERSION define to always retain its legacy value. 2. adopts a new #define strategy for the kernel's internal implementation of the preferred magic. 3. deprecates v2 capability magic in favor of a new (v3) magic number. The functionality of v3 is entirely equivalent to v2, the only difference being that the v2 magic causes the kernel to log a "deprecated" warning so the admin can find applications that may be using v2 inappropriately. [User space code continues to be encouraged to use the libcap API which protects the application from details like this. libcap-2.10 is the first to support v3 capabilities.] Fixes issue reported in https://bugzilla.redhat.com/show_bug.cgi?id=447518. Thanks to Bojan Smojver for the report. [akpm@linux-foundation.org: s/depreciate/deprecate/g] [akpm@linux-foundation.org: be robust about put_user size] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrew G. Morgan Cc: Serge E. Hallyn Cc: Bojan Smojver Cc: stable@kernel.org Signed-off-by: Andrew Morton Signed-off-by: Chris Wright --- kernel/capability.c | 111 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 73 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 39e8193b41e..cfbe4429948 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -52,6 +52,69 @@ static void warn_legacy_capability_use(void) } } +/* + * Version 2 capabilities worked fine, but the linux/capability.h file + * that accompanied their introduction encouraged their use without + * the necessary user-space source code changes. As such, we have + * created a version 3 with equivalent functionality to version 2, but + * with a header change to protect legacy source code from using + * version 2 when it wanted to use version 1. If your system has code + * that trips the following warning, it is using version 2 specific + * capabilities and may be doing so insecurely. + * + * The remedy is to either upgrade your version of libcap (to 2.10+, + * if the application is linked against it), or recompile your + * application with modern kernel headers and this warning will go + * away. + */ + +static void warn_deprecated_v2(void) +{ + static int warned; + + if (!warned) { + char name[sizeof(current->comm)]; + + printk(KERN_INFO "warning: `%s' uses deprecated v2" + " capabilities in a way that may be insecure.\n", + get_task_comm(name, current)); + warned = 1; + } +} + +/* + * Version check. Return the number of u32s in each capability flag + * array, or a negative value on error. 
+ */ +static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) +{ + __u32 version; + + if (get_user(version, &header->version)) + return -EFAULT; + + switch (version) { + case _LINUX_CAPABILITY_VERSION_1: + warn_legacy_capability_use(); + *tocopy = _LINUX_CAPABILITY_U32S_1; + break; + case _LINUX_CAPABILITY_VERSION_2: + warn_deprecated_v2(); + /* + * fall through - v3 is otherwise equivalent to v2. + */ + case _LINUX_CAPABILITY_VERSION_3: + *tocopy = _LINUX_CAPABILITY_U32S_3; + break; + default: + if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) + return -EFAULT; + return -EINVAL; + } + + return 0; +} + /* * For sys_getproccap() and sys_setproccap(), any of the three * capability set pointers may be NULL -- indicating that that set is @@ -71,27 +134,13 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) { int ret = 0; pid_t pid; - __u32 version; struct task_struct *target; unsigned tocopy; kernel_cap_t pE, pI, pP; - if (get_user(version, &header->version)) - return -EFAULT; - - switch (version) { - case _LINUX_CAPABILITY_VERSION_1: - warn_legacy_capability_use(); - tocopy = _LINUX_CAPABILITY_U32S_1; - break; - case _LINUX_CAPABILITY_VERSION_2: - tocopy = _LINUX_CAPABILITY_U32S_2; - break; - default: - if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) - return -EFAULT; - return -EINVAL; - } + ret = cap_validate_magic(header, &tocopy); + if (ret != 0) + return ret; if (get_user(pid, &header->pid)) return -EFAULT; @@ -118,7 +167,7 @@ out: spin_unlock(&task_capability_lock); if (!ret) { - struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S]; + struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i; for (i = 0; i < tocopy; i++) { @@ -128,7 +177,7 @@ out: } /* - * Note, in the case, tocopy < _LINUX_CAPABILITY_U32S, + * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, * we silently drop the upper capabilities here. 
This * has the effect of making older libcap * implementations implicitly drop upper capability @@ -240,30 +289,16 @@ static inline int cap_set_all(kernel_cap_t *effective, */ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) { - struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S]; + struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i, tocopy; kernel_cap_t inheritable, permitted, effective; - __u32 version; struct task_struct *target; int ret; pid_t pid; - if (get_user(version, &header->version)) - return -EFAULT; - - switch (version) { - case _LINUX_CAPABILITY_VERSION_1: - warn_legacy_capability_use(); - tocopy = _LINUX_CAPABILITY_U32S_1; - break; - case _LINUX_CAPABILITY_VERSION_2: - tocopy = _LINUX_CAPABILITY_U32S_2; - break; - default: - if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) - return -EFAULT; - return -EINVAL; - } + ret = cap_validate_magic(header, &tocopy); + if (ret != 0) + return ret; if (get_user(pid, &header->pid)) return -EFAULT; @@ -281,7 +316,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) permitted.cap[i] = kdata[i].permitted; inheritable.cap[i] = kdata[i].inheritable; } - while (i < _LINUX_CAPABILITY_U32S) { + while (i < _KERNEL_CAPABILITY_U32S) { effective.cap[i] = 0; permitted.cap[i] = 0; inheritable.cap[i] = 0; -- cgit v1.2.3 From 76094a2cf46e4ab776055d4086615b884408568c Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Wed, 28 May 2008 00:03:18 +0530 Subject: ftrace: distinguish kretprobe'd functions in trace logs Tracing functions via ftrace which have a kretprobe installed on them, can produce misleading output in their trace logs. E.g, consider the correct trace of the following sequence: do_IRQ() { ~ irq_enter(); ~ } Trace log (sample): -0 [00] 4154504455.781616: irq_enter <- do_IRQ But if irq_enter() has a kretprobe installed on it, the return value stored on the stack at each invocation is modified to divert the return to a kprobe trampoline function called kretprobe_trampoline(). So with this the trace would (currently) look like: -0 [00] 4154504455.781616: irq_enter <- kretprobe_trampoline Now this is quite misleading to the end user, as it suggests something that didn't actually happen. 
So just to avoid such misinterpretations, the inlined patch aims to output such a log as: -0 [00] 4154504455.781616: irq_enter <- [unknown/kretprobe'd] Signed-off-by: Abhishek Sagar Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0feae23d989..12f5e817380 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -1199,6 +1200,20 @@ static void s_stop(struct seq_file *m, void *p) mutex_unlock(&trace_types_lock); } +#define KRETPROBE_MSG "[unknown/kretprobe'd]" + +#ifdef CONFIG_KRETPROBES +static inline int kretprobed(unsigned long addr) +{ + return addr == (unsigned long)kretprobe_trampoline; +} +#else +static inline int kretprobed(unsigned long addr) +{ + return 0; +} +#endif /* CONFIG_KRETPROBES */ + static int seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) { @@ -1434,7 +1449,10 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) case TRACE_FN: seq_print_ip_sym(s, entry->fn.ip, sym_flags); trace_seq_puts(s, " ("); - seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags); + if (kretprobed(entry->fn.parent_ip)) + trace_seq_puts(s, KRETPROBE_MSG); + else + seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags); trace_seq_puts(s, ")\n"); break; case TRACE_CTX: @@ -1514,8 +1532,11 @@ static int print_trace_fmt(struct trace_iterator *iter) ret = trace_seq_printf(s, " <-"); if (!ret) return 0; - ret = seq_print_ip_sym(s, entry->fn.parent_ip, - sym_flags); + if (kretprobed(entry->fn.parent_ip)) + ret = trace_seq_puts(s, KRETPROBE_MSG); + else + ret = seq_print_ip_sym(s, entry->fn.parent_ip, + sym_flags); if (!ret) return 0; } -- cgit v1.2.3 From ad90c0e3ce8d20d6873b57e36181ef6d7a0097fe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 May 2008 20:48:37 -0400 Subject: ftrace: user update and disable dynamic ftrace daemon In dynamic ftrace, the mcount function starts off pointing to a stub function that just returns. On start up, the call to the stub is modified to point to a "record_ip" function. The job of the record_ip function is to add the function to a pre-allocated hash list. If the function is already there, it simply is ignored, otherwise it is added to the list. Later, a ftraced daemon wakes up and calls kstop_machine if any functions have been recorded, and changes the calls to the recorded functions to a simple nop. If no functions were recorded, the daemon goes back to sleep. The daemon wakes up once a second to see if it needs to update any newly recorded functions into nops. Usually it does not, but if a lot of code has been executed for the first time in the kernel, the ftraced daemon will call kstop_machine to update those into nops. The problem currently is that there's no way to stop the daemon from doing this, and it can cause unneeded latencies (800us which for some is bothersome). This patch adds a new file /debugfs/tracing/ftraced_enabled. If the daemon is active, reading this will return "enabled\n" and "disabled\n" when the daemon is not running. To disable the daemon, the user can echo "0" or "disable" into this file, and "1" or "enable" to re-enable the daemon. 
Since the daemon is used to convert the functions into nops to increase the performance of the system, I also added that anytime something is written into the ftraced_enabled file, kstop_machine will run if there are new functions that have been detected that need to be converted. This way the user can disable the daemon but still be able to control the conversion of the mcount calls to nops by simply, "echo 0 > /debugfs/tracing/ftraced_enabled" when they need to do more conversions. To see the number of converted functions: "cat /debugfs/tracing/dyn_ftrace_total_info" Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 157 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 110 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1843edc098a..f762f5a2d33 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -151,8 +151,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE static struct task_struct *ftraced_task; -static DECLARE_WAIT_QUEUE_HEAD(ftraced_waiters); -static unsigned long ftraced_iteration_counter; enum { FTRACE_ENABLE_CALLS = (1 << 0), @@ -189,6 +187,7 @@ static struct ftrace_page *ftrace_pages; static int ftraced_trigger; static int ftraced_suspend; +static int ftraced_stop; static int ftrace_record_suspend; @@ -474,14 +473,21 @@ ftrace_code_disable(struct dyn_ftrace *rec) return 1; } +static int __ftrace_update_code(void *ignore); + static int __ftrace_modify_code(void *data) { unsigned long addr; int *command = data; - if (*command & FTRACE_ENABLE_CALLS) + if (*command & FTRACE_ENABLE_CALLS) { + /* + * Update any recorded ips now that we have the + * machine stopped + */ + __ftrace_update_code(NULL); ftrace_replace_code(1); - else if (*command & FTRACE_DISABLE_CALLS) + } else if (*command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); if (*command & FTRACE_UPDATE_TRACE_FUNC) @@ -503,6 +509,25 @@ static void ftrace_run_update_code(int command) stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); } +void ftrace_disable_daemon(void) +{ + /* Stop the daemon from calling kstop_machine */ + mutex_lock(&ftraced_lock); + ftraced_stop = 1; + mutex_unlock(&ftraced_lock); + + ftrace_force_update(); +} + +void ftrace_enable_daemon(void) +{ + mutex_lock(&ftraced_lock); + ftraced_stop = 0; + mutex_unlock(&ftraced_lock); + + ftrace_force_update(); +} + static ftrace_func_t saved_ftrace_func; static void ftrace_startup(void) @@ -603,6 +628,7 @@ static int __ftrace_update_code(void *ignore) int i; /* Don't be recording funcs now */ + ftrace_record_suspend++; save_ftrace_enabled = ftrace_enabled; ftrace_enabled = 0; @@ -628,18 +654,23 @@ static int __ftrace_update_code(void *ignore) stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; + ftraced_trigger = 0; ftrace_enabled = save_ftrace_enabled; + ftrace_record_suspend--; return 0; } -static void ftrace_update_code(void) +static int ftrace_update_code(void) { - if (unlikely(ftrace_disabled)) - return; + if (unlikely(ftrace_disabled) || + !ftrace_enabled || !ftraced_trigger) + return 0; stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); + + return 1; } static int ftraced(void *ignore) @@ -658,14 +689,13 @@ static int ftraced(void *ignore) mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftraced_lock); - if (ftrace_enabled && ftraced_trigger && !ftraced_suspend) { - ftrace_record_suspend++; - 
ftrace_update_code(); + if (!ftraced_suspend && !ftraced_stop && + ftrace_update_code()) { usecs = nsecs_to_usecs(ftrace_update_time); if (ftrace_update_tot_cnt > 100000) { ftrace_update_tot_cnt = 0; pr_info("hm, dftrace overflow: %lu change%s" - " (%lu total) in %lu usec%s\n", + " (%lu total) in %lu usec%s\n", ftrace_update_cnt, ftrace_update_cnt != 1 ? "s" : "", ftrace_update_tot_cnt, @@ -673,15 +703,10 @@ static int ftraced(void *ignore) ftrace_disabled = 1; WARN_ON_ONCE(1); } - ftraced_trigger = 0; - ftrace_record_suspend--; } - ftraced_iteration_counter++; mutex_unlock(&ftraced_lock); mutex_unlock(&ftrace_sysctl_lock); - wake_up_interruptible(&ftraced_waiters); - ftrace_shutdown_replenish(); } __set_current_state(TASK_RUNNING); @@ -1219,6 +1244,55 @@ ftrace_notrace_release(struct inode *inode, struct file *file) return ftrace_regex_release(inode, file, 0); } +static ssize_t +ftraced_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + /* don't worry about races */ + char *buf = ftraced_stop ? "disabled\n" : "enabled\n"; + int r = strlen(buf); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +ftraced_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + if (strncmp(buf, "enable", 6) == 0) + val = 1; + else if (strncmp(buf, "disable", 7) == 0) + val = 0; + else { + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + } + + if (val) + ftrace_enable_daemon(); + else + ftrace_disable_daemon(); + + filp->f_pos += cnt; + + return cnt; +} + static struct file_operations ftrace_avail_fops = { .open = ftrace_avail_open, .read = seq_read, @@ -1242,51 +1316,34 @@ static struct file_operations ftrace_notrace_fops = { .release = ftrace_notrace_release, }; +static struct file_operations ftraced_fops = { + .open = tracing_open_generic, + .read = ftraced_read, + .write = ftraced_write, +}; + /** * ftrace_force_update - force an update to all recording ftrace functions - * - * The ftrace dynamic update daemon only wakes up once a second. - * There may be cases where an update needs to be done immediately - * for tests or internal kernel tracing to begin. This function - * wakes the daemon to do an update and will not return until the - * update is complete. */ int ftrace_force_update(void) { - unsigned long last_counter; - DECLARE_WAITQUEUE(wait, current); int ret = 0; if (unlikely(ftrace_disabled)) return -ENODEV; + mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftraced_lock); - last_counter = ftraced_iteration_counter; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&ftraced_waiters, &wait); - if (unlikely(!ftraced_task)) { - ret = -ENODEV; - goto out; - } - - do { - mutex_unlock(&ftraced_lock); - wake_up_process(ftraced_task); - schedule(); - mutex_lock(&ftraced_lock); - if (signal_pending(current)) { - ret = -EINTR; - break; - } - set_current_state(TASK_INTERRUPTIBLE); - } while (last_counter == ftraced_iteration_counter); + /* + * If ftraced_trigger is not set, then there is nothing + * to update. 
+ */ + if (ftraced_trigger && !ftrace_update_code()) + ret = -EBUSY; - out: mutex_unlock(&ftraced_lock); - remove_wait_queue(&ftraced_waiters, &wait); - set_current_state(TASK_RUNNING); + mutex_unlock(&ftrace_sysctl_lock); return ret; } @@ -1331,6 +1388,12 @@ static __init int ftrace_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs " "'set_ftrace_notrace' entry\n"); + + entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer, + NULL, &ftraced_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'ftraced_enabled' entry\n"); return 0; } -- cgit v1.2.3 From 18404756765c713a0be4eb1082920c04822ce588 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 29 May 2008 11:02:52 -0700 Subject: genirq: Expose default irq affinity mask (take 3) Current IRQ affinity interface does not provide a way to set affinity for the IRQs that will be allocated/activated in the future. This patch creates /proc/irq/default_smp_affinity that lets users set default affinity mask for the newly allocated IRQs. Changing the default does not affect affinity masks for the currently active IRQs, they have to be changed explicitly. Updated based on Paul J's comments and added some more documentation. Signed-off-by: Max Krasnyansky Cc: pj@sgi.com Cc: a.p.zijlstra@chello.nl Cc: tglx@linutronix.de Cc: rdunlap@xenotime.net Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 28 +++++++++++++++++++++++-- kernel/irq/proc.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 81 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 46d6611a33b..469814e9b9e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -17,6 +17,8 @@ #ifdef CONFIG_SMP +cpumask_t irq_default_affinity = CPU_MASK_ALL; + /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for @@ -95,6 +97,27 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) return 0; } +#ifndef CONFIG_AUTO_IRQ_AFFINITY +/* + * Generic version of the affinity autoselector. 
+ */ +int irq_select_affinity(unsigned int irq) +{ + cpumask_t mask; + + if (!irq_can_set_affinity(irq)) + return 0; + + cpus_and(mask, cpu_online_map, irq_default_affinity); + + irq_desc[irq].affinity = mask; + irq_desc[irq].chip->set_affinity(irq, mask); + + set_balance_irq_affinity(irq, mask); + return 0; +} +#endif + #endif /** @@ -382,6 +405,9 @@ int setup_irq(unsigned int irq, struct irqaction *new) } else /* Undo nested disables: */ desc->depth = 1; + + /* Set default affinity mask once everything is setup */ + irq_select_affinity(irq); } /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; @@ -571,8 +597,6 @@ int request_irq(unsigned int irq, irq_handler_t handler, action->next = NULL; action->dev_id = dev_id; - select_smp_affinity(irq); - #ifdef CONFIG_DEBUG_SHIRQ if (irqflags & IRQF_SHARED) { /* diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index c2f2ccb0549..6c6d35d68ee 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -44,7 +44,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, unsigned long count, void *data) { unsigned int irq = (int)(long)data, full_count = count, err; - cpumask_t new_value, tmp; + cpumask_t new_value; if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) @@ -62,17 +62,51 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted. */ - cpus_and(tmp, new_value, cpu_online_map); - if (cpus_empty(tmp)) + if (!cpus_intersects(new_value, cpu_online_map)) /* Special case for empty set - allow the architecture code to set default SMP affinity. */ - return select_smp_affinity(irq) ? -EINVAL : full_count; + return irq_select_affinity(irq) ? -EINVAL : full_count; irq_set_affinity(irq, new_value); return full_count; } +static int default_affinity_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len = cpumask_scnprintf(page, count, irq_default_affinity); + if (count - len < 2) + return -EINVAL; + len += sprintf(page + len, "\n"); + return len; +} + +static int default_affinity_write(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + unsigned int full_count = count, err; + cpumask_t new_value; + + err = cpumask_parse_user(buffer, count, new_value); + if (err) + return err; + + if (!is_affinity_mask_valid(new_value)) + return -EINVAL; + + /* + * Do not allow disabling IRQs completely - it's a too easy + * way to make the system unusable accidentally :-) At least + * one online CPU still has to be targeted. 
+ */ + if (!cpus_intersects(new_value, cpu_online_map)) + return -EINVAL; + + irq_default_affinity = new_value; + + return full_count; +} #endif static int irq_spurious_read(char *page, char **start, off_t off, @@ -171,6 +205,21 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action) remove_proc_entry(action->dir->name, irq_desc[irq].dir); } +void register_default_affinity_proc(void) +{ +#ifdef CONFIG_SMP + struct proc_dir_entry *entry; + + /* create /proc/irq/default_smp_affinity */ + entry = create_proc_entry("default_smp_affinity", 0600, root_irq_dir); + if (entry) { + entry->data = NULL; + entry->read_proc = default_affinity_read; + entry->write_proc = default_affinity_write; + } +#endif +} + void init_irq_proc(void) { int i; @@ -180,6 +229,8 @@ void init_irq_proc(void) if (!root_irq_dir) return; + register_default_affinity_proc(); + /* * Create entries for all existing IRQs. */ -- cgit v1.2.3 From 45c01e824991b2dd0a332e19efc4901acb31209f Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 12 May 2008 21:20:41 +0200 Subject: sched: prioritize non-migratable tasks over migratable ones Dmitry Adamushko pointed out a known flaw in the rt-balancing algorithm that could allow suboptimal balancing if a non-migratable task gets queued behind a running migratable one. It is discussed in this thread: http://lkml.org/lkml/2008/4/22/296 This issue has been further exacerbated by a recent checkin to sched-devel (git-id 5eee63a5ebc19a870ac40055c0be49457f3a89a3). >From a pure priority standpoint, the run-queue is doing the "right" thing. Using Dmitry's nomenclature, if T0 is on cpu1 first, and T1 wakes up at equal or lower priority (affined only to cpu1) later, it *should* wait for T0 to finish. However, in reality that is likely suboptimal from a system perspective if there are other cores that could allow T0 and T1 to run concurrently. Since T1 can not migrate, the only choice for higher concurrency is to try to move T0. This is not something we addessed in the recent rt-balancing re-work. This patch tries to enhance the balancing algorithm by accomodating this scenario. It accomplishes this by incorporating the migratability of a task into its priority calculation. Within a numerical tsk->prio, a non-migratable task is logically higher than a migratable one. We maintain this by introducing a new per-priority queue (xqueue, or exclusive-queue) for holding non-migratable tasks. The scheduler will draw from the xqueue over the standard shared-queue (squeue) when available. There are several details for utilizing this properly. 1) During task-wake-up, we not only need to check if the priority preempts the current task, but we also need to check for this non-migratable condition. Therefore, if a non-migratable task wakes up and sees an equal priority migratable task already running, it will attempt to preempt it *if* there is a likelyhood that the current task will find an immediate home. 2) Tasks only get this non-migratable "priority boost" on wake-up. Any requeuing will result in the non-migratable task being queued to the end of the shared queue. This is an attempt to prevent the system from being completely unfair to migratable tasks during things like SCHED_RR timeslicing. I am sure this patch introduces potentially "odd" behavior if you concoct a scenario where a bunch of non-migratable threads could starve migratable ones given the right pattern. 
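[Illustrative sketch, not part of the patch: a hypothetical userspace model of the exclusive-before-shared pick described in points 1) and 2) above. Every name below is invented; the real changes are in the kernel/sched.c and kernel/sched_rt.c hunks further down.]

/*
 * Two run lists per priority: an "exclusive" one for tasks pinned to
 * this CPU and a "shared" one for migratable tasks. At equal priority
 * the exclusive list is drained first.
 */
#include <stdio.h>

#define NPRIO 4

struct task { const char *name; struct task *next; };

static struct task *xqueue[NPRIO];	/* non-migratable (pinned) tasks */
static struct task *squeue[NPRIO];	/* migratable tasks */

static void push(struct task **q, struct task *t)
{
	t->next = *q;
	*q = t;
}

static struct task *pick_next(void)
{
	int prio;

	for (prio = 0; prio < NPRIO; prio++) {	/* 0 = highest priority */
		if (xqueue[prio])		/* exclusive queue wins ... */
			return xqueue[prio];
		if (squeue[prio])		/* ... then the shared queue */
			return squeue[prio];
	}
	return NULL;
}

int main(void)
{
	struct task migratable = { "migratable", NULL };
	struct task pinned = { "pinned", NULL };

	push(&squeue[1], &migratable);
	push(&xqueue[1], &pinned);	/* same priority, affined to one CPU */

	printf("next: %s\n", pick_next()->name);	/* prints "pinned" */
	return 0;
}

At equal priority the pinned task is picked first, which is the "logically higher" treatment described above; a requeue places it back on the shared list, as point 2) notes.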
I am not yet convinced that this is a problem since we are talking about tasks of equal RT priority anyway, and there never is much in the way of guarantees against starvation under that scenario anyway. (e.g. you could come up with a similar scenario with a specific timing environment verses an affinity environment). I can be convinced otherwise, but for now I think this is "ok". Signed-off-by: Gregory Haskins CC: Dmitry Adamushko CC: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 6 +++-- kernel/sched_rt.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 72 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index bfb8ad8ed17..7178b8c2351 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -151,7 +151,8 @@ static inline int task_has_rt_policy(struct task_struct *p) */ struct rt_prio_array { DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ - struct list_head queue[MAX_RT_PRIO]; + struct list_head xqueue[MAX_RT_PRIO]; /* exclusive queue */ + struct list_head squeue[MAX_RT_PRIO]; /* shared queue */ }; struct rt_bandwidth { @@ -7542,7 +7543,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) array = &rt_rq->active; for (i = 0; i < MAX_RT_PRIO; i++) { - INIT_LIST_HEAD(array->queue + i); + INIT_LIST_HEAD(array->xqueue + i); + INIT_LIST_HEAD(array->squeue + i); __clear_bit(i, array->bitmap); } /* delimiter for bitsearch: */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 3432d573205..fefed39fafd 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -458,7 +458,13 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se) if (group_rq && rt_rq_throttled(group_rq)) return; - list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); + if (rt_se->nr_cpus_allowed == 1) + list_add_tail(&rt_se->run_list, + array->xqueue + rt_se_prio(rt_se)); + else + list_add_tail(&rt_se->run_list, + array->squeue + rt_se_prio(rt_se)); + __set_bit(rt_se_prio(rt_se), array->bitmap); inc_rt_tasks(rt_se, rt_rq); @@ -470,7 +476,8 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) struct rt_prio_array *array = &rt_rq->active; list_del_init(&rt_se->run_list); - if (list_empty(array->queue + rt_se_prio(rt_se))) + if (list_empty(array->squeue + rt_se_prio(rt_se)) + && list_empty(array->xqueue + rt_se_prio(rt_se))) __clear_bit(rt_se_prio(rt_se), array->bitmap); dec_rt_tasks(rt_se, rt_rq); @@ -537,13 +544,19 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) /* * Put task to the end of the run list without the overhead of dequeue * followed by enqueue. + * + * Note: We always enqueue the task to the shared-queue, regardless of its + * previous position w.r.t. exclusive vs shared. This is so that exclusive RR + * tasks fairly round-robin with all tasks on the runqueue, not just other + * exclusive tasks. 
*/ static void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) { struct rt_prio_array *array = &rt_rq->active; - list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); + list_del_init(&rt_se->run_list); + list_add_tail(&rt_se->run_list, array->squeue + rt_se_prio(rt_se)); } static void requeue_task_rt(struct rq *rq, struct task_struct *p) @@ -601,13 +614,46 @@ static int select_task_rq_rt(struct task_struct *p, int sync) } #endif /* CONFIG_SMP */ +static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, + struct rt_rq *rt_rq); + /* * Preempt the current task with a newly woken task if needed: */ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) { - if (p->prio < rq->curr->prio) + if (p->prio < rq->curr->prio) { resched_task(rq->curr); + return; + } + +#ifdef CONFIG_SMP + /* + * If: + * + * - the newly woken task is of equal priority to the current task + * - the newly woken task is non-migratable while current is migratable + * - current will be preempted on the next reschedule + * + * we should check to see if current can readily move to a different + * cpu. If so, we will reschedule to allow the push logic to try + * to move current somewhere else, making room for our non-migratable + * task. + */ + if((p->prio == rq->curr->prio) + && p->rt.nr_cpus_allowed == 1 + && rq->curr->rt.nr_cpus_allowed != 1 + && pick_next_rt_entity(rq, &rq->rt) != &rq->curr->rt) { + cpumask_t mask; + + if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) + /* + * There appears to be other cpus that can accept + * current, so lets reschedule to try and push it away + */ + resched_task(rq->curr); + } +#endif } static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, @@ -621,8 +667,15 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, idx = sched_find_first_bit(array->bitmap); BUG_ON(idx >= MAX_RT_PRIO); - queue = array->queue + idx; - next = list_entry(queue->next, struct sched_rt_entity, run_list); + queue = array->xqueue + idx; + if (!list_empty(queue)) + next = list_entry(queue->next, struct sched_rt_entity, + run_list); + else { + queue = array->squeue + idx; + next = list_entry(queue->next, struct sched_rt_entity, + run_list); + } return next; } @@ -692,7 +745,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) continue; if (next && next->prio < idx) continue; - list_for_each_entry(rt_se, array->queue + idx, run_list) { + list_for_each_entry(rt_se, array->squeue + idx, run_list) { struct task_struct *p = rt_task_of(rt_se); if (pick_rt_task(rq, p, cpu)) { next = p; @@ -1146,6 +1199,14 @@ static void set_cpus_allowed_rt(struct task_struct *p, } update_rt_migration(rq); + + if (unlikely(weight == 1 || p->rt.nr_cpus_allowed == 1)) + /* + * If either the new or old weight is a "1", we need + * to requeue to properly move between shared and + * exclusive queues. + */ + requeue_task_rt(rq, p); } p->cpus_allowed = *new_mask; -- cgit v1.2.3 From f333fdc9098b71e2687e4e9b6349fcb352960d66 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 12 May 2008 21:20:55 +0200 Subject: sched: make !hrtick faster it is safe to ignore timers and flags when the feature is disabled. 
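[Illustrative sketch, not part of the patch: the sched.c hunk below evaluates sched_feat(HRTICK) once per pass through schedule() and skips hrtick_clear()/hrtick_set() entirely when the feature is off. A tiny model of hoisting a feature test out of a hot path; all names here are made up.]

/*
 * Illustrative only: sample the feature flag once and reuse the local
 * value, instead of re-testing (and doing optional setup/teardown
 * work) at several points in the hot path.
 */
#include <stdbool.h>
#include <stdio.h>

static bool feature_enabled;		/* stands in for a feature bit */

static void optional_prepare(void) { puts("prepare"); }
static void optional_finish(void)  { puts("finish"); }

static void hot_path(void)
{
	bool feat = feature_enabled;	/* read once per invocation */

	if (feat)
		optional_prepare();

	/* ... main work of the hot path ... */

	if (feat)			/* same decision, no second test */
		optional_finish();
}

int main(void)
{
	hot_path();			/* feature off: both calls skipped */
	feature_enabled = true;
	hot_path();			/* feature on: prepare + finish */
	return 0;
}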
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7178b8c2351..aa960b84b88 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4134,7 +4134,7 @@ asmlinkage void __sched schedule(void) struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; - int cpu; + int cpu, hrtick = sched_feat(HRTICK); need_resched: preempt_disable(); @@ -4149,7 +4149,8 @@ need_resched_nonpreemptible: schedule_debug(prev); - hrtick_clear(rq); + if (hrtick) + hrtick_clear(rq); /* * Do the rq-clock update outside the rq lock: @@ -4197,7 +4198,8 @@ need_resched_nonpreemptible: } else spin_unlock_irq(&rq->lock); - hrtick_set(rq); + if (hrtick) + hrtick_set(rq); if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; -- cgit v1.2.3 From 6e0534f278199f1e3dd1049b9bc19a7a5b87ada1 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 12 May 2008 21:21:01 +0200 Subject: sched: use a 2-d bitmap for searching lowest-pri CPU The current code use a linear algorithm which causes scaling issues on larger SMP machines. This patch replaces that algorithm with a 2-dimensional bitmap to reduce latencies in the wake-up path. Signed-off-by: Gregory Haskins Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/Makefile | 1 + kernel/sched.c | 7 ++ kernel/sched_cpupri.c | 174 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched_cpupri.h | 36 +++++++++++ kernel/sched_rt.c | 98 ++++++---------------------- 5 files changed, 239 insertions(+), 77 deletions(-) create mode 100644 kernel/sched_cpupri.c create mode 100644 kernel/sched_cpupri.h (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 1c9938addb9..ecdd2d33563 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_SMP) += sched_cpupri.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/sched.c b/kernel/sched.c index aa960b84b88..8a1257b6556 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -74,6 +74,8 @@ #include #include +#include "sched_cpupri.h" + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -450,6 +452,9 @@ struct root_domain { */ cpumask_t rto_mask; atomic_t rto_count; +#ifdef CONFIG_SMP + struct cpupri cpupri; +#endif }; /* @@ -6392,6 +6397,8 @@ static void init_rootdomain(struct root_domain *rd) cpus_clear(rd->span); cpus_clear(rd->online); + + cpupri_init(&rd->cpupri); } static void init_defrootdomain(void) diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c new file mode 100644 index 00000000000..52154fefab7 --- /dev/null +++ b/kernel/sched_cpupri.c @@ -0,0 +1,174 @@ +/* + * kernel/sched_cpupri.c + * + * CPU priority management + * + * Copyright (C) 2007-2008 Novell + * + * Author: Gregory Haskins + * + * This code tracks the priority of each CPU so that global migration + * decisions are easy to calculate. Each CPU can be in a state as follows: + * + * (INVALID), IDLE, NORMAL, RT1, ... RT99 + * + * going from the lowest priority to the highest. CPUs in the INVALID state + * are not eligible for routing. 
The system maintains this state with + * a 2 dimensional bitmap (the first for priority class, the second for cpus + * in that class). Therefore a typical application without affinity + * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit + * searches). For tasks with affinity restrictions, the algorithm has a + * worst case complexity of O(min(102, nr_domcpus)), though the scenario that + * yields the worst case search is fairly contrived. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include "sched_cpupri.h" + +/* Convert between a 140 based task->prio, and our 102 based cpupri */ +static int convert_prio(int prio) +{ + int cpupri; + + if (prio == CPUPRI_INVALID) + cpupri = CPUPRI_INVALID; + else if (prio == MAX_PRIO) + cpupri = CPUPRI_IDLE; + else if (prio >= MAX_RT_PRIO) + cpupri = CPUPRI_NORMAL; + else + cpupri = MAX_RT_PRIO - prio + 1; + + return cpupri; +} + +#define for_each_cpupri_active(array, idx) \ + for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ + idx < CPUPRI_NR_PRIORITIES; \ + idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1)) + +/** + * cpupri_find - find the best (lowest-pri) CPU in the system + * @cp: The cpupri context + * @p: The task + * @lowest_mask: A mask to fill in with selected CPUs + * + * Note: This function returns the recommended CPUs as calculated during the + * current invokation. By the time the call returns, the CPUs may have in + * fact changed priorities any number of times. While not ideal, it is not + * an issue of correctness since the normal rebalancer logic will correct + * any discrepancies created by racing against the uncertainty of the current + * priority configuration. 
+ * + * Returns: (int)bool - CPUs were found + */ +int cpupri_find(struct cpupri *cp, struct task_struct *p, + cpumask_t *lowest_mask) +{ + int idx = 0; + int task_pri = convert_prio(p->prio); + + for_each_cpupri_active(cp->pri_active, idx) { + struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; + cpumask_t mask; + + if (idx >= task_pri) + break; + + cpus_and(mask, p->cpus_allowed, vec->mask); + + if (cpus_empty(mask)) + continue; + + *lowest_mask = mask; + return 1; + } + + return 0; +} + +/** + * cpupri_set - update the cpu priority setting + * @cp: The cpupri context + * @cpu: The target cpu + * @pri: The priority (INVALID-RT99) to assign to this CPU + * + * Note: Assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpupri_set(struct cpupri *cp, int cpu, int newpri) +{ + int *currpri = &cp->cpu_to_pri[cpu]; + int oldpri = *currpri; + unsigned long flags; + + newpri = convert_prio(newpri); + + BUG_ON(newpri >= CPUPRI_NR_PRIORITIES); + + if (newpri == oldpri) + return; + + /* + * If the cpu was currently mapped to a different value, we + * first need to unmap the old value + */ + if (likely(oldpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; + + spin_lock_irqsave(&vec->lock, flags); + + vec->count--; + if (!vec->count) + clear_bit(oldpri, cp->pri_active); + cpu_clear(cpu, vec->mask); + + spin_unlock_irqrestore(&vec->lock, flags); + } + + if (likely(newpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; + + spin_lock_irqsave(&vec->lock, flags); + + cpu_set(cpu, vec->mask); + vec->count++; + if (vec->count == 1) + set_bit(newpri, cp->pri_active); + + spin_unlock_irqrestore(&vec->lock, flags); + } + + *currpri = newpri; +} + +/** + * cpupri_init - initialize the cpupri structure + * @cp: The cpupri context + * + * Returns: (void) + */ +void cpupri_init(struct cpupri *cp) +{ + int i; + + memset(cp, 0, sizeof(*cp)); + + for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { + struct cpupri_vec *vec = &cp->pri_to_cpu[i]; + + spin_lock_init(&vec->lock); + vec->count = 0; + cpus_clear(vec->mask); + } + + for_each_possible_cpu(i) + cp->cpu_to_pri[i] = CPUPRI_INVALID; +} + + diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h new file mode 100644 index 00000000000..0b6a3d110fa --- /dev/null +++ b/kernel/sched_cpupri.h @@ -0,0 +1,36 @@ +#ifndef _LINUX_CPUPRI_H +#define _LINUX_CPUPRI_H + +#include + +#define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO +#define CPUPRI_NR_PRI_WORDS CPUPRI_NR_PRIORITIES/BITS_PER_LONG + +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE 0 +#define CPUPRI_NORMAL 1 +/* values 2-101 are RT priorities 0-99 */ + +struct cpupri_vec { + spinlock_t lock; + int count; + cpumask_t mask; +}; + +struct cpupri { + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + long pri_active[CPUPRI_NR_PRI_WORDS]; + int cpu_to_pri[NR_CPUS]; +}; + +#ifdef CONFIG_SMP +int cpupri_find(struct cpupri *cp, + struct task_struct *p, cpumask_t *lowest_mask); +void cpupri_set(struct cpupri *cp, int cpu, int pri); +void cpupri_init(struct cpupri *cp); +#else +#define cpupri_set(cp, cpu, pri) do { } while (0) +#define cpupri_init() do { } while (0) +#endif + +#endif /* _LINUX_CPUPRI_H */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index fefed39fafd..44b06d75416 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -391,8 +391,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_prio(rt_se_prio(rt_se))); rt_rq->rt_nr_running++; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - if (rt_se_prio(rt_se) < 
rt_rq->highest_prio) + if (rt_se_prio(rt_se) < rt_rq->highest_prio) { + struct rq *rq = rq_of_rt_rq(rt_rq); rt_rq->highest_prio = rt_se_prio(rt_se); + cpupri_set(&rq->rd->cpupri, rq->cpu, rt_se_prio(rt_se)); + } #endif #ifdef CONFIG_SMP if (rt_se->nr_cpus_allowed > 1) { @@ -416,6 +419,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static inline void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { +#ifdef CONFIG_SMP + int highest_prio = rt_rq->highest_prio; +#endif + WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); rt_rq->rt_nr_running--; @@ -439,6 +446,11 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rq->rt.rt_nr_migratory--; } + if (rt_rq->highest_prio != highest_prio) { + struct rq *rq = rq_of_rt_rq(rt_rq); + cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio); + } + update_rt_migration(rq_of_rt_rq(rt_rq)); #endif /* CONFIG_SMP */ #ifdef CONFIG_RT_GROUP_SCHED @@ -763,73 +775,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); -static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) -{ - int lowest_prio = -1; - int lowest_cpu = -1; - int count = 0; - int cpu; - - cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed); - - /* - * Scan each rq for the lowest prio. - */ - for_each_cpu_mask(cpu, *lowest_mask) { - struct rq *rq = cpu_rq(cpu); - - /* We look for lowest RT prio or non-rt CPU */ - if (rq->rt.highest_prio >= MAX_RT_PRIO) { - /* - * if we already found a low RT queue - * and now we found this non-rt queue - * clear the mask and set our bit. - * Otherwise just return the queue as is - * and the count==1 will cause the algorithm - * to use the first bit found. - */ - if (lowest_cpu != -1) { - cpus_clear(*lowest_mask); - cpu_set(rq->cpu, *lowest_mask); - } - return 1; - } - - /* no locking for now */ - if ((rq->rt.highest_prio > task->prio) - && (rq->rt.highest_prio >= lowest_prio)) { - if (rq->rt.highest_prio > lowest_prio) { - /* new low - clear old data */ - lowest_prio = rq->rt.highest_prio; - lowest_cpu = cpu; - count = 0; - } - count++; - } else - cpu_clear(cpu, *lowest_mask); - } - - /* - * Clear out all the set bits that represent - * runqueues that were of higher prio than - * the lowest_prio. - */ - if (lowest_cpu > 0) { - /* - * Perhaps we could add another cpumask op to - * zero out bits. Like cpu_zero_bits(cpumask, nrbits); - * Then that could be optimized to use memset and such. - */ - for_each_cpu_mask(cpu, *lowest_mask) { - if (cpu >= lowest_cpu) - break; - cpu_clear(cpu, *lowest_mask); - } - } - - return count; -} - static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) { int first; @@ -851,17 +796,12 @@ static int find_lowest_rq(struct task_struct *task) cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); - int count = find_lowest_cpus(task, lowest_mask); - if (!count) - return -1; /* No targets found */ + if (task->rt.nr_cpus_allowed == 1) + return -1; /* No other targets possible */ - /* - * There is no sense in performing an optimal search if only one - * target is found. 
- */ - if (count == 1) - return first_cpu(*lowest_mask); + if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) + return -1; /* No targets found */ /* * At this point we have built a mask of cpus representing the @@ -1218,6 +1158,8 @@ static void join_domain_rt(struct rq *rq) { if (rq->rt.overloaded) rt_set_overload(rq); + + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); } /* Assumes rq->lock is held */ @@ -1225,6 +1167,8 @@ static void leave_domain_rt(struct rq *rq) { if (rq->rt.overloaded) rt_clear_overload(rq); + + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); } /* -- cgit v1.2.3 From 6d299f1b53b84e2665f402d9bcc494800aba6386 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 12 May 2008 21:21:14 +0200 Subject: sched: fix SCHED_OTHER balance iterator to include all tasks The currently logic inadvertently skips the last task on the run-queue, resulting in missed balance opportunities. Signed-off-by: Gregory Haskins Signed-off-by: David Bahi CC: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched_fair.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 08ae848b71d..1fe4c65a817 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1275,23 +1275,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) struct task_struct *p = NULL; struct sched_entity *se; - if (next == &cfs_rq->tasks) - return NULL; - - /* Skip over entities that are not tasks */ - do { + while (next != &cfs_rq->tasks) { se = list_entry(next, struct sched_entity, group_node); next = next->next; - } while (next != &cfs_rq->tasks && !entity_is_task(se)); - if (next == &cfs_rq->tasks) - return NULL; + /* Skip over entities that are not tasks */ + if (entity_is_task(se)) { + p = task_of(se); + break; + } + } cfs_rq->balance_iterator = next; - - if (entity_is_task(se)) - p = task_of(se); - return p; } -- cgit v1.2.3 From d07355f5def74d060333563b36ab51b89fd44cdd Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 12 May 2008 21:21:15 +0200 Subject: sched: check for SD_SERIALIZE atomically in rebalance_domains() Nothing really serious here, mainly just a matter of nit-picking :-/ From: Dmitry Adamushko For CONFIG_SCHED_DEBUG && CONFIG_SYSCT configs, sd->flags can be altered while being manipulated in rebalance_domains(). Let's do an atomic check. We rely here on the atomicity of read/write accesses for aligned words. 
Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8a1257b6556..90329f1f894 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3668,6 +3668,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; + int need_serialize; cpumask_t tmp; for_each_domain(cpu, sd) { @@ -3685,8 +3686,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (interval > HZ*NR_CPUS/10) interval = HZ*NR_CPUS/10; + need_serialize = sd->flags & SD_SERIALIZE; - if (sd->flags & SD_SERIALIZE) { + if (need_serialize) { if (!spin_trylock(&balancing)) goto out; } @@ -3702,7 +3704,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } sd->last_balance = jiffies; } - if (sd->flags & SD_SERIALIZE) + if (need_serialize) spin_unlock(&balancing); out: if (time_after(next_balance, sd->last_balance + interval)) { -- cgit v1.2.3 From f7dcd80bbc8e7032443e6539ea1b830364f82200 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 May 2008 23:20:38 +0200 Subject: namespacecheck: fixes in kernel/sched.c Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 90329f1f894..02a5eeedcb9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1633,7 +1633,7 @@ inline int task_curr(const struct task_struct *p) } /* Used instead of source_load when we know the type == 0 */ -unsigned long weighted_cpuload(const int cpu) +static unsigned long weighted_cpuload(const int cpu) { return cpu_rq(cpu)->load.weight; } -- cgit v1.2.3 From 81d41d7ece23a1c3b4bcd1604026d3a06cc4dc79 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Sun, 11 May 2008 05:55:33 +0530 Subject: sched: fix defined-but-unused warning Fix this warning, which appears with !CONFIG_SMP: kernel/sched.c:1216: warning: `init_hrtick' defined but not used Signed-off-by: Rabin Vincent Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 02a5eeedcb9..f3faec52c5a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1130,6 +1130,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) return HRTIMER_NORESTART; } +#ifdef CONFIG_SMP static void hotplug_hrtick_disable(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -1185,6 +1186,7 @@ static void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } +#endif /* CONFIG_SMP */ static void init_rq_hrtick(struct rq *rq) { -- cgit v1.2.3 From e21f5b153b9b4a6775d7d41964e372e13a9178ab Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 23 May 2008 09:05:58 -0700 Subject: sched: print module list in the "scheduling while atomic" warning For the normal WARN_ON() etc we added a print-the-modules-list already, which is very useful to figure out candidates for certain types of bugs. This patch adds the same print to the "scheduling while atomic" BUG warning, for the same reason: when we get here it's very useful to see which modules are loaded, to narrow down the candidate code list. 
Signed-off-by: Arjan van de Ven Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f3faec52c5a..84a360670b9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4070,6 +4070,7 @@ static noinline void __schedule_bug(struct task_struct *prev) prev->comm, prev->pid, preempt_count()); debug_show_held_locks(prev); + print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); -- cgit v1.2.3 From 6d6bc0ad867c46896d0994bb039e7550ecb9b51d Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Fri, 30 May 2008 14:23:45 +0200 Subject: sched: add comments for ifdefs in sched.c make sched.c easier to read. Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 76 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 38 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 84a360670b9..ef4e25604bb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -292,15 +292,15 @@ struct task_group root_task_group; static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; -#endif -#else +#endif /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ #define root_task_group init_task_group -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. @@ -310,9 +310,9 @@ static DEFINE_SPINLOCK(task_group_lock); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) -#else +#else /* !CONFIG_USER_SCHED */ # define INIT_TASK_GROUP_LOAD NICE_0_LOAD -#endif +#endif /* CONFIG_USER_SCHED */ /* * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems. 
@@ -1316,15 +1316,15 @@ void wake_up_idle_cpu(int cpu) if (!tsk_is_polling(rq->idle)) smp_send_reschedule(cpu); } -#endif +#endif /* CONFIG_NO_HZ */ -#else +#else /* !CONFIG_SMP */ static void __resched_task(struct task_struct *p, int tif_bit) { assert_spin_locked(&task_rq(p)->lock); set_tsk_thread_flag(p, tif_bit); } -#endif +#endif /* CONFIG_SMP */ #if BITS_PER_LONG == 32 # define WMULT_CONST (~0UL) @@ -2129,7 +2129,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) } } } -#endif +#endif /* CONFIG_SCHEDSTATS */ out_activate: #endif /* CONFIG_SMP */ @@ -2329,7 +2329,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, notifier->ops->sched_out(notifier, next); } -#else +#else /* !CONFIG_PREEMPT_NOTIFIERS */ static void fire_sched_in_preempt_notifiers(struct task_struct *curr) { @@ -2341,7 +2341,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, { } -#endif +#endif /* CONFIG_PREEMPT_NOTIFIERS */ /** * prepare_task_switch - prepare to switch tasks @@ -6300,9 +6300,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } kfree(groupmask); } -#else +#else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) -#endif +#endif /* CONFIG_SCHED_DEBUG */ static int sd_degenerate(struct sched_domain *sd) { @@ -6598,7 +6598,7 @@ static void sched_domain_node_span(int node, cpumask_t *span) cpus_or(*span, *span, *nodemask); } } -#endif +#endif /* CONFIG_NUMA */ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; @@ -6617,7 +6617,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, *sg = &per_cpu(sched_group_cpus, cpu); return cpu; } -#endif +#endif /* CONFIG_SCHED_SMT */ /* * multi-core sched-domains: @@ -6625,7 +6625,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct sched_domain, core_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_core); -#endif +#endif /* CONFIG_SCHED_MC */ #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int @@ -6727,7 +6727,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) sg = sg->next; } while (sg != group_head); } -#endif +#endif /* CONFIG_NUMA */ #ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ @@ -6764,11 +6764,11 @@ next_sg: sched_group_nodes_bycpu[cpu] = NULL; } } -#else +#else /* !CONFIG_NUMA */ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { } -#endif +#endif /* CONFIG_NUMA */ /* * Initialize sched groups cpu_power. @@ -7459,7 +7459,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) #endif return err; } -#endif +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ /* * Force a reinitialization of the sched domains hierarchy. 
The domains @@ -7677,8 +7677,8 @@ void __init sched_init(void) root_task_group.cfs_rq = (struct cfs_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); -#endif -#endif +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED init_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); @@ -7692,8 +7692,8 @@ void __init sched_init(void) root_task_group.rt_rq = (struct rt_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); -#endif -#endif +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_RT_GROUP_SCHED */ } #ifdef CONFIG_SMP @@ -7709,8 +7709,8 @@ void __init sched_init(void) #ifdef CONFIG_USER_SCHED init_rt_bandwidth(&root_task_group.rt_bandwidth, global_rt_period(), RUNTIME_INF); -#endif -#endif +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_GROUP_SCHED list_add(&init_task_group.list, &task_groups); @@ -7720,8 +7720,8 @@ void __init sched_init(void) INIT_LIST_HEAD(&root_task_group.children); init_task_group.parent = &root_task_group; list_add(&init_task_group.siblings, &root_task_group.children); -#endif -#endif +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_GROUP_SCHED */ for_each_possible_cpu(i) { struct rq *rq; @@ -8040,7 +8040,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) { list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); } -#else +#else /* !CONFG_FAIR_GROUP_SCHED */ static inline void free_fair_sched_group(struct task_group *tg) { } @@ -8058,7 +8058,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu) static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) { } -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static void free_rt_sched_group(struct task_group *tg) @@ -8129,7 +8129,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) { list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); } -#else +#else /* !CONFIG_RT_GROUP_SCHED */ static inline void free_rt_sched_group(struct task_group *tg) { } @@ -8147,7 +8147,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu) static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) { } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_GROUP_SCHED static void free_sched_group(struct task_group *tg) @@ -8258,7 +8258,7 @@ void sched_move_task(struct task_struct *tsk) task_rq_unlock(rq, &flags); } -#endif +#endif /* CONFIG_GROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED static void set_se_shares(struct sched_entity *se, unsigned long shares) @@ -8508,7 +8508,7 @@ static int sched_rt_global_constraints(void) return ret; } -#else +#else /* !CONFIG_RT_GROUP_SCHED */ static int sched_rt_global_constraints(void) { unsigned long flags; @@ -8526,7 +8526,7 @@ static int sched_rt_global_constraints(void) return 0; } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ int sched_rt_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, @@ -8634,7 +8634,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) return (u64) tg->shares; } -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, @@ -8658,7 +8658,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) { return sched_group_rt_period(cgroup_tg(cgrp)); } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ static struct cftype cpu_files[] = { #ifdef 
CONFIG_FAIR_GROUP_SCHED -- cgit v1.2.3 From 099f98c8a1f13501a98afbfff4756395a610581c Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Thu, 29 May 2008 20:56:32 +0530 Subject: sched: print the sd->level in sched_domain_debug code While printing out the visual representation of the sched-domains, print the level (MC, SMT, CPU, NODE, ... ) of each of the sched_domains. Credit: Peter Zijlstra Signed-off-by: Gautham R Shenoy Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ef4e25604bb..dc0be113f41 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6197,6 +6197,28 @@ void __init migration_init(void) #ifdef CONFIG_SCHED_DEBUG +static inline const char *sd_level_to_string(enum sched_domain_level lvl) +{ + switch (lvl) { + case SD_LV_NONE: + return "NONE"; + case SD_LV_SIBLING: + return "SIBLING"; + case SD_LV_MC: + return "MC"; + case SD_LV_CPU: + return "CPU"; + case SD_LV_NODE: + return "NODE"; + case SD_LV_ALLNODES: + return "ALLNODES"; + case SD_LV_MAX: + return "MAX"; + + } + return "MAX"; +} + static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_t *groupmask) { @@ -6216,7 +6238,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, return -1; } - printk(KERN_CONT "span %s\n", str); + printk(KERN_CONT "span %s level %s\n", + str, sd_level_to_string(sd->level)); if (!cpu_isset(cpu, sd->span)) { printk(KERN_ERR "ERROR: domain->span does not contain " -- cgit v1.2.3 From 1f11eb6a8bc92536d9e93ead48fa3ffbd1478571 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 4 Jun 2008 15:04:05 -0400 Subject: sched: fix cpupri hotplug support The RT folks over at RedHat found an issue w.r.t. hotplug support which was traced to problems with the cpupri infrastructure in the scheduler: https://bugzilla.redhat.com/show_bug.cgi?id=449676 This bug affects 23-rt12+, 24-rtX, 25-rtX, and sched-devel. This patch applies to 25.4-rt4, though it should trivially apply to most cpupri enabled kernels mentioned above. It turned out that the issue was that offline cpus could get inadvertently registered with cpupri so that they were erroneously selected during migration decisions. The end result would be an OOPS as the offline cpu had tasks routed to it. This patch generalizes the old join/leave domain interface into an online/offline interface, and adjusts the root-domain/hotplug code to utilize it. I was able to easily reproduce the issue prior to this patch, and am no longer able to reproduce it after this patch. I can offline cpus indefinately and everything seems to be in working order. Thanks to Arnaldo (acme), Thomas, and Peter for doing the legwork to point me in the right direction. Also thank you to Peter for reviewing the early iterations of this patch. 
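Because this export is limited to kernel/, the header half of the new interface is not visible below; as a rough sketch (not the verbatim include/linux/sched.h change), the per-class hooks look like:

        struct sched_class {
                /* ... */
        #ifdef CONFIG_SMP
                void (*rq_online)(struct rq *rq);       /* rq joins its root domain  */
                void (*rq_offline)(struct rq *rq);      /* rq leaves its root domain */
        #endif
                /* ... */
        };

sched_rt.c registers rq_online_rt()/rq_offline_rt() through these hooks (see the hunks below), and the offline hook drops the cpu from cpupri, so cpupri_find() can no longer hand back an offline cpu as a push target.
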
Signed-off-by: Gregory Haskins Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 54 ++++++++++++++++++++++++++++++++++++++++-------------- kernel/sched_rt.c | 24 ++++++++++++++++++------ 2 files changed, 58 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index dc0be113f41..f0ed81b7128 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -529,6 +529,7 @@ struct rq { int push_cpu; /* cpu of this runqueue: */ int cpu; + int online; struct task_struct *migration_thread; struct list_head migration_queue; @@ -1498,6 +1499,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) #endif #define sched_class_highest (&rt_sched_class) +#define for_each_class(class) \ + for (class = sched_class_highest; class; class = class->next) static inline void inc_load(struct rq *rq, const struct task_struct *p) { @@ -6065,6 +6068,36 @@ static void unregister_sched_domain_sysctl(void) } #endif +static void set_rq_online(struct rq *rq) +{ + if (!rq->online) { + const struct sched_class *class; + + cpu_set(rq->cpu, rq->rd->online); + rq->online = 1; + + for_each_class(class) { + if (class->rq_online) + class->rq_online(rq); + } + } +} + +static void set_rq_offline(struct rq *rq) +{ + if (rq->online) { + const struct sched_class *class; + + for_each_class(class) { + if (class->rq_offline) + class->rq_offline(rq); + } + + cpu_clear(rq->cpu, rq->rd->online); + rq->online = 0; + } +} + /* * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. @@ -6102,7 +6135,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpu_isset(cpu, rq->rd->span)); - cpu_set(cpu, rq->rd->online); + + set_rq_online(rq); } spin_unlock_irqrestore(&rq->lock, flags); break; @@ -6163,7 +6197,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpu_isset(cpu, rq->rd->span)); - cpu_clear(cpu, rq->rd->online); + set_rq_offline(rq); } spin_unlock_irqrestore(&rq->lock, flags); break; @@ -6385,20 +6419,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) static void rq_attach_root(struct rq *rq, struct root_domain *rd) { unsigned long flags; - const struct sched_class *class; spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { struct root_domain *old_rd = rq->rd; - for (class = sched_class_highest; class; class = class->next) { - if (class->leave_domain) - class->leave_domain(rq); - } + if (cpu_isset(rq->cpu, old_rd->online)) + set_rq_offline(rq); cpu_clear(rq->cpu, old_rd->span); - cpu_clear(rq->cpu, old_rd->online); if (atomic_dec_and_test(&old_rd->refcount)) kfree(old_rd); @@ -6409,12 +6439,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) cpu_set(rq->cpu, rd->span); if (cpu_isset(rq->cpu, cpu_online_map)) - cpu_set(rq->cpu, rd->online); - - for (class = sched_class_highest; class; class = class->next) { - if (class->join_domain) - class->join_domain(rq); - } + set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); } @@ -7824,6 +7849,7 @@ void __init sched_init(void) rq->next_balance = jiffies; rq->push_cpu = 0; rq->cpu = i; + rq->online = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); rq_attach_root(rq, &def_root_domain); diff --git a/kernel/sched_rt.c 
b/kernel/sched_rt.c index 44b06d75416..e4821593d4d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -12,6 +12,9 @@ static inline int rt_overloaded(struct rq *rq) static inline void rt_set_overload(struct rq *rq) { + if (!rq->online) + return; + cpu_set(rq->cpu, rq->rd->rto_mask); /* * Make sure the mask is visible before we set @@ -26,6 +29,9 @@ static inline void rt_set_overload(struct rq *rq) static inline void rt_clear_overload(struct rq *rq) { + if (!rq->online) + return; + /* the order here really doesn't matter */ atomic_dec(&rq->rd->rto_count); cpu_clear(rq->cpu, rq->rd->rto_mask); @@ -394,7 +400,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) if (rt_se_prio(rt_se) < rt_rq->highest_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); rt_rq->highest_prio = rt_se_prio(rt_se); - cpupri_set(&rq->rd->cpupri, rq->cpu, rt_se_prio(rt_se)); + + if (rq->online) + cpupri_set(&rq->rd->cpupri, rq->cpu, + rt_se_prio(rt_se)); } #endif #ifdef CONFIG_SMP @@ -448,7 +457,10 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) if (rt_rq->highest_prio != highest_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); - cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio); + + if (rq->online) + cpupri_set(&rq->rd->cpupri, rq->cpu, + rt_rq->highest_prio); } update_rt_migration(rq_of_rt_rq(rt_rq)); @@ -1154,7 +1166,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, } /* Assumes rq->lock is held */ -static void join_domain_rt(struct rq *rq) +static void rq_online_rt(struct rq *rq) { if (rq->rt.overloaded) rt_set_overload(rq); @@ -1163,7 +1175,7 @@ static void join_domain_rt(struct rq *rq) } /* Assumes rq->lock is held */ -static void leave_domain_rt(struct rq *rq) +static void rq_offline_rt(struct rq *rq) { if (rq->rt.overloaded) rt_clear_overload(rq); @@ -1331,8 +1343,8 @@ static const struct sched_class rt_sched_class = { .load_balance = load_balance_rt, .move_one_task = move_one_task_rt, .set_cpus_allowed = set_cpus_allowed_rt, - .join_domain = join_domain_rt, - .leave_domain = leave_domain_rt, + .rq_online = rq_online_rt, + .rq_offline = rq_offline_rt, .pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, .task_wake_up = task_wake_up_rt, -- cgit v1.2.3 From 709d4b0c60f990bccf3e10ba7c6da407ad65c97f Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 4 Jun 2008 15:04:10 -0400 Subject: sched: fix cpupri priocount A rounding error was pointed out by Peter Zijlstra which would result in the structure holding priorities to be off by one. Signed-off-by: Gregory Haskins Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Signed-off-by: Thomas Gleixner --- kernel/sched_cpupri.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 0b6a3d110fa..6b38355e267 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -4,7 +4,7 @@ #include #define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO -#define CPUPRI_NR_PRI_WORDS CPUPRI_NR_PRIORITIES/BITS_PER_LONG +#define CPUPRI_NR_PRI_WORDS (CPUPRI_NR_PRIORITIES + BITS_PER_LONG/2)/BITS_PER_LONG #define CPUPRI_INVALID -1 #define CPUPRI_IDLE 0 -- cgit v1.2.3 From e539d8fcd11af811db70707d47ea436d5621d0da Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 5 Jun 2008 10:28:00 +0200 Subject: sched: fix the cpuprio count really Peter pointed out that the last version of the "fix" was still one off under certain circumstances. Use BITS_TO_LONG instead to get an accurate result. 
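The arithmetic is easy to check in isolation; a standalone sketch, assuming 64-bit longs and MAX_RT_PRIO == 100 (so 102 priorities):

        #include <stdio.h>

        #define BITS_PER_LONG    ((int)(8 * sizeof(long)))
        #define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

        int main(void)
        {
                int nr = 100 + 2;       /* MAX_RT_PRIO + 2 */

                printf("%d\n", nr / BITS_PER_LONG);                        /* 1: truncates, one word short */
                printf("%d\n", (nr + BITS_PER_LONG / 2) / BITS_PER_LONG);  /* 2: happens to work for 102   */
                printf("%d\n", (65 + BITS_PER_LONG / 2) / BITS_PER_LONG);  /* 1: ... but not for 65        */
                printf("%d\n", BITS_TO_LONGS(65));                         /* 2: always rounds up          */
                return 0;
        }

The kernel's BITS_TO_LONGS() is the same round-up division, which is why the hunk below is exact for any priority count.
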
Signed-off-by: Thomas Gleixner --- kernel/sched_cpupri.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 6b38355e267..f25811b0f93 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -3,8 +3,8 @@ #include -#define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO -#define CPUPRI_NR_PRI_WORDS (CPUPRI_NR_PRIORITIES + BITS_PER_LONG/2)/BITS_PER_LONG +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) +#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES) #define CPUPRI_INVALID -1 #define CPUPRI_IDLE 0 -- cgit v1.2.3 From 1100ac91b6af02d8639d518fad5b434b1bf44ed6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Jun 2008 12:25:37 +0200 Subject: sched: fix cpuprio build bug this patch was not built on !SMP: kernel/sched_rt.c: In function 'inc_rt_tasks': kernel/sched_rt.c:404: error: 'struct rq' has no member named 'online' Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index e4821593d4d..eaa606071d5 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -399,16 +399,19 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED if (rt_se_prio(rt_se) < rt_rq->highest_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); - rt_rq->highest_prio = rt_se_prio(rt_se); + rt_rq->highest_prio = rt_se_prio(rt_se); +#ifdef CONFIG_SMP if (rq->online) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_se_prio(rt_se)); +#endif } #endif #ifdef CONFIG_SMP if (rt_se->nr_cpus_allowed > 1) { struct rq *rq = rq_of_rt_rq(rt_rq); + rq->rt.rt_nr_migratory++; } -- cgit v1.2.3 From 5c8e1ed1d204a6770ca2854cd3b3597070fe7e5a Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 29 May 2008 11:17:01 -0700 Subject: sched: CPU hotplug events must not destroy scheduler domains created by the cpusets First issue is not related to the cpusets. We're simply leaking doms_cur. It's allocated in arch_init_sched_domains() which is called for every hotplug event. So we just keep reallocation doms_cur without freeing it. I introduced free_sched_domains() function that cleans things up. Second issue is that sched domains created by the cpusets are completely destroyed by the CPU hotplug events. For all CPU hotplug events scheduler attaches all CPUs to the NULL domain and then puts them all into the single domain thereby destroying domains created by the cpusets (partition_sched_domains). The solution is simple, when cpusets are enabled scheduler should not create default domain and instead let cpusets do that. Which is exactly what the patch does. Signed-off-by: Max Krasnyansky Cc: pj@sgi.com Cc: menage@google.com Cc: rostedt@goodmis.org Cc: mingo@elte.hu Acked-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/cpuset.c | 6 ++++++ kernel/sched.c | 22 ++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 86ea9e34e32..6090d18b58a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1886,6 +1886,12 @@ static void common_cpu_mem_hotplug_unplug(void) top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; scan_for_empty_cpusets(&top_cpuset); + /* + * Scheduler destroys domains on hotplug events. + * Rebuild them based on the current settings. 
+ */ + rebuild_sched_domains(); + cgroup_unlock(); } diff --git a/kernel/sched.c b/kernel/sched.c index f0ed81b7128..1ddb0a8c797 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7292,6 +7292,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void) { } +/* + * Free current domain masks. + * Called after all cpus are attached to NULL domain. + */ +static void free_sched_domains(void) +{ + ndoms_cur = 0; + if (doms_cur != &fallback_doms) + kfree(doms_cur); + doms_cur = &fallback_doms; +} + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to @@ -7439,6 +7451,7 @@ int arch_reinit_sched_domains(void) get_online_cpus(); mutex_lock(&sched_domains_mutex); detach_destroy_domains(&cpu_online_map); + free_sched_domains(); err = arch_init_sched_domains(&cpu_online_map); mutex_unlock(&sched_domains_mutex); put_online_cpus(); @@ -7524,6 +7537,7 @@ static int update_sched_domains(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: detach_destroy_domains(&cpu_online_map); + free_sched_domains(); return NOTIFY_OK; case CPU_UP_CANCELED: @@ -7542,8 +7556,16 @@ static int update_sched_domains(struct notifier_block *nfb, return NOTIFY_DONE; } +#ifndef CONFIG_CPUSETS + /* + * Create default domain partitioning if cpusets are disabled. + * Otherwise we let cpusets rebuild the domains based on the + * current setup. + */ + /* The hotplug lock is already held by cpu_up/cpu_down */ arch_init_sched_domains(&cpu_online_map); +#endif return NOTIFY_OK; } -- cgit v1.2.3 From 68f4f1ec08e3d95730a2693b99df8260aa0d06ae Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 29 May 2008 11:17:02 -0700 Subject: sched: Move cpu masks from kernel/sched.c into kernel/cpu.c kernel/cpu.c seems a more logical place for those maps since they do not really have much to do with the scheduler these days. kernel/cpu.c is now built for the UP kernel too, but it does not affect the size the kernel sections. 
$ size vmlinux before text data bss dec hex filename 3313797 307060 310352 3931209 3bfc49 vmlinux after text data bss dec hex filename 3313797 307060 310352 3931209 3bfc49 vmlinux Signed-off-by: Max Krasnyansky Cc: pj@sgi.com Cc: menage@google.com Cc: rostedt@goodmis.org Cc: mingo@elte.hu Acked-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/Makefile | 4 ++-- kernel/cpu.c | 24 ++++++++++++++++++++++++ kernel/sched.c | 18 ------------------ 3 files changed, 26 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index ecdd2d33563..6c55301112e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -3,7 +3,7 @@ # obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ - exit.o itimer.o time.o softirq.o resource.o \ + cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ @@ -27,7 +27,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o -obj-$(CONFIG_SMP) += cpu.o spinlock.o +obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o diff --git a/kernel/cpu.c b/kernel/cpu.c index c77bc3a1c72..b11f06dc149 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -15,6 +15,28 @@ #include #include +/* + * Represents all cpu's present in the system + * In systems capable of hotplug, this map could dynamically grow + * as new cpu's are detected in the system via any platform specific + * method, such as ACPI for e.g. + */ +cpumask_t cpu_present_map __read_mostly; +EXPORT_SYMBOL(cpu_present_map); + +#ifndef CONFIG_SMP + +/* + * Represents all cpu's that are currently online. + */ +cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_online_map); + +cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_possible_map); + +#else /* CONFIG_SMP */ + /* Serializes the updates to cpu_online_map, cpu_present_map */ static DEFINE_MUTEX(cpu_add_remove_lock); @@ -403,3 +425,5 @@ out: cpu_maps_update_done(); } #endif /* CONFIG_PM_SLEEP_SMP */ + +#endif /* CONFIG_SMP */ diff --git a/kernel/sched.c b/kernel/sched.c index 1ddb0a8c797..f36f549e574 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5080,24 +5080,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, return sched_setaffinity(pid, &new_mask); } -/* - * Represents all cpu's present in the system - * In systems capable of hotplug, this map could dynamically grow - * as new cpu's are detected in the system via any platform specific - * method, such as ACPI for e.g. - */ - -cpumask_t cpu_present_map __read_mostly; -EXPORT_SYMBOL(cpu_present_map); - -#ifndef CONFIG_SMP -cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; -EXPORT_SYMBOL(cpu_online_map); - -cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; -EXPORT_SYMBOL(cpu_possible_map); -#endif - long sched_getaffinity(pid_t pid, cpumask_t *mask) { struct task_struct *p; -- cgit v1.2.3 From e958b3600484533ff801920290468adc8135f89d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Jun 2008 23:22:32 +0200 Subject: sched: move weighted_cpuload into #ifdef CONFIG_SMP section weighted_cpuload is only used on SMP. move it into the CONFIG_SMP section. 
Signed-off-by: Thomas Gleixner --- kernel/sched.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f36f549e574..727bdef7616 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1637,12 +1637,6 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->load.weight; -} - static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { set_task_rq(p, cpu); @@ -1671,6 +1665,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, #ifdef CONFIG_SMP +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->load.weight; +} + /* * Is this task likely cache-hot: */ -- cgit v1.2.3 From 37340746a66e5e7feed5945f28cb75d90a8fd9f6 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 5 Jun 2008 22:46:32 -0700 Subject: cpusets: fix bug when adding nonexistent cpu or mem Adding a nonexistent cpu to a cpuset will be omitted quietly. It should return -EINVAL. Example: (real_nr_cpus <= 4 < NR_CPUS or cpu#4 was just offline) # cat cpus 0-1 # /bin/echo 4 > cpus # /bin/echo $? 0 # cat cpus # The same occurs when add a nonexistent mem. This patch will fix this bug. And when *buf == "", the check is unneeded. Signed-off-by: Lai Jiangshan Acked-by: Paul Jackson Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 86ea9e34e32..039baa4cd90 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -797,8 +797,10 @@ static int update_cpumask(struct cpuset *cs, char *buf) retval = cpulist_parse(buf, trialcs.cpus_allowed); if (retval < 0) return retval; + + if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map)) + return -EINVAL; } - cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map); retval = validate_change(cs, &trialcs); if (retval < 0) return retval; @@ -932,9 +934,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) retval = nodelist_parse(buf, trialcs.mems_allowed); if (retval < 0) goto done; + + if (!nodes_subset(trialcs.mems_allowed, + node_states[N_HIGH_MEMORY])) + return -EINVAL; } - nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, - node_states[N_HIGH_MEMORY]); oldmem = cs->mems_allowed; if (nodes_equal(oldmem, trialcs.mems_allowed)) { retval = 0; /* Too easy - nothing to do */ -- cgit v1.2.3 From 16882c1e962b4be5122fc05aaf2afc10fd9e2d15 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jun 2008 21:20:41 +0400 Subject: sched: fix TASK_WAKEKILL vs SIGKILL race schedule() has the special "TASK_INTERRUPTIBLE && signal_pending()" case, this allows us to do current->state = TASK_INTERRUPTIBLE; schedule(); without fear to sleep with pending signal. However, the code like current->state = TASK_KILLABLE; schedule(); is not right, schedule() doesn't take TASK_WAKEKILL into account. This means that mutex_lock_killable(), wait_for_completion_killable(), down_killable(), schedule_timeout_killable() can miss SIGKILL (and btw the second SIGKILL has no effect). Introduce the new helper, signal_pending_state(), and change schedule() to use it. Hopefully it will have more users, that is why the task's state is passed separately. 
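What the helper boils down to, as a sketch (simplified; the exact form in include/linux/sched.h may differ slightly):

        static inline int signal_pending_state(long state, struct task_struct *p)
        {
                if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
                        return 0;
                if (!signal_pending(p))
                        return 0;

                /* preserve current ptrace_notify() behaviour, see the note below */
                if (state & (__TASK_STOPPED | __TASK_TRACED))
                        return 0;

                return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
        }
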
Note this "__TASK_STOPPED | __TASK_TRACED" check in signal_pending_state(). This is needed to preserve the current behaviour (ptrace_notify). I hope this check will be removed soon, but this (afaics good) change needs the separate discussion. The fast path is "(state & (INTERRUPTIBLE | WAKEKILL)) + signal_pending(p)", basically the same that schedule() does now. However, this patch of course bloats schedule(). Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index bfb8ad8ed17..2c65bf29d13 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4159,12 +4159,10 @@ need_resched_nonpreemptible: clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - signal_pending(prev))) { + if (unlikely(signal_pending_state(prev->state, prev))) prev->state = TASK_RUNNING; - } else { + else deactivate_task(rq, prev, 1); - } switch_count = &prev->nvcsw; } -- cgit v1.2.3 From 6ad36762d7a88d747f6fed95194b4f7ff5da8df4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jun 2008 21:20:42 +0400 Subject: __mutex_lock_common: use signal_pending_state() Change __mutex_lock_common() to use signal_pending_state() for the sake of the code re-use. This adds 7 bytes to kernel/mutex.o, but afaics only because gcc isn't smart enough. (btw, uninlining of __mutex_lock_common() shrinks .text from 2722 to 1542, perhaps it is worth doing). Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar --- kernel/mutex.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index d046a345d36..bcdc9ac8ef6 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -165,10 +165,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * got a signal? (This code gets eliminated in the * TASK_UNINTERRUPTIBLE case.) */ - if (unlikely((state == TASK_INTERRUPTIBLE && - signal_pending(task)) || - (state == TASK_KILLABLE && - fatal_signal_pending(task)))) { + if (unlikely(signal_pending_state(state, task))) { mutex_remove_waiter(lock, &waiter, task_thread_info(task)); mutex_release(&lock->dep_map, 1, ip); -- cgit v1.2.3 From 0eb967012ea15e6e8cfab483d9fa37bc602d400c Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sun, 1 Jun 2008 21:47:30 +0530 Subject: ftrace: prevent freeing of all failed updates Prevent freeing of records which cause problems and correspond to function from core kernel text. A new flag, FTRACE_FL_CONVERTED is used to mark a record as "converted". All other records are patched lazily to NOPs. Failed records now also remain on frace_hash table. Each invocation of ftrace_record_ip now checks whether the traced function has ever been recorded (including past failures) and doesn't re-record it again. 
Signed-off-by: Abhishek Sagar Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 76 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f762f5a2d33..ec54cb7d69d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -216,6 +216,12 @@ ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) hlist_add_head_rcu(&node->node, &ftrace_hash[key]); } +/* called from kstop_machine */ +static inline void ftrace_del_hash(struct dyn_ftrace *node) +{ + hlist_del(&node->node); +} + static void ftrace_free_rec(struct dyn_ftrace *rec) { /* no locking, only called from kstop_machine */ @@ -332,12 +338,11 @@ ftrace_record_ip(unsigned long ip) #define FTRACE_ADDR ((long)(ftrace_caller)) #define MCOUNT_ADDR ((long)(mcount)) -static void +static int __ftrace_replace_code(struct dyn_ftrace *rec, unsigned char *old, unsigned char *new, int enable) { unsigned long ip, fl; - int failed; ip = rec->ip; @@ -364,7 +369,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || (fl == 0) || (rec->flags & FTRACE_FL_NOTRACE)) - return; + return 0; /* * If it is enabled disable it, @@ -388,7 +393,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, */ fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED); if (fl == FTRACE_FL_NOTRACE) - return; + return 0; new = ftrace_call_replace(ip, FTRACE_ADDR); } else @@ -396,34 +401,24 @@ __ftrace_replace_code(struct dyn_ftrace *rec, if (enable) { if (rec->flags & FTRACE_FL_ENABLED) - return; + return 0; rec->flags |= FTRACE_FL_ENABLED; } else { if (!(rec->flags & FTRACE_FL_ENABLED)) - return; + return 0; rec->flags &= ~FTRACE_FL_ENABLED; } } - failed = ftrace_modify_code(ip, old, new); - if (failed) { - unsigned long key; - /* It is possible that the function hasn't been converted yet */ - key = hash_long(ip, FTRACE_HASHBITS); - if (!ftrace_ip_in_hash(ip, key)) { - rec->flags |= FTRACE_FL_FAILED; - ftrace_free_rec(rec); - } - - } + return ftrace_modify_code(ip, old, new); } static void ftrace_replace_code(int enable) { + int i, failed; unsigned char *new = NULL, *old = NULL; struct dyn_ftrace *rec; struct ftrace_page *pg; - int i; if (enable) old = ftrace_nop_replace(); @@ -438,7 +433,15 @@ static void ftrace_replace_code(int enable) if (rec->flags & FTRACE_FL_FAILED) continue; - __ftrace_replace_code(rec, old, new, enable); + failed = __ftrace_replace_code(rec, old, new, enable); + if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { + rec->flags |= FTRACE_FL_FAILED; + if ((system_state == SYSTEM_BOOTING) || + !kernel_text_address(rec->ip)) { + ftrace_del_hash(rec); + ftrace_free_rec(rec); + } + } } } } @@ -467,7 +470,6 @@ ftrace_code_disable(struct dyn_ftrace *rec) failed = ftrace_modify_code(ip, call, nop); if (failed) { rec->flags |= FTRACE_FL_FAILED; - ftrace_free_rec(rec); return 0; } return 1; @@ -621,8 +623,7 @@ unsigned long ftrace_update_tot_cnt; static int __ftrace_update_code(void *ignore) { struct dyn_ftrace *p; - struct hlist_head head; - struct hlist_node *t; + struct hlist_node *t, *n; int save_ftrace_enabled; cycle_t start, stop; int i; @@ -637,18 +638,33 @@ static int __ftrace_update_code(void *ignore) /* No locks needed, the machine is stopped! 
*/ for (i = 0; i < FTRACE_HASHSIZE; i++) { - if (hlist_empty(&ftrace_hash[i])) - continue; + /* all CPUS are stopped, we are safe to modify code */ + hlist_for_each_entry_safe(p, t, n, &ftrace_hash[i], node) { + /* Skip over failed records which have not been + * freed. */ + if (p->flags & FTRACE_FL_FAILED) + continue; - head = ftrace_hash[i]; - INIT_HLIST_HEAD(&ftrace_hash[i]); + /* Unconverted records are always at the head of the + * hash bucket. Once we encounter a converted record, + * simply skip over to the next bucket. Saves ftraced + * some processor cycles (ftrace does its bid for + * global warming :-p ). */ + if (p->flags & (FTRACE_FL_CONVERTED)) + break; - /* all CPUS are stopped, we are safe to modify code */ - hlist_for_each_entry(p, t, &head, node) { - if (ftrace_code_disable(p)) + if (ftrace_code_disable(p)) { + p->flags |= FTRACE_FL_CONVERTED; ftrace_update_cnt++; - } + } else { + if ((system_state == SYSTEM_BOOTING) || + !kernel_text_address(p->ip)) { + ftrace_del_hash(p); + ftrace_free_rec(p); + } + } + } } stop = ftrace_now(raw_smp_processor_id()); -- cgit v1.2.3 From 1d74f2a0f64b4091e5e91b55ac1b17dff93f4b59 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sun, 1 Jun 2008 21:47:42 +0530 Subject: ftrace: remove ftrace_ip_converted() Remove the unneeded function ftrace_ip_converted(). Signed-off-by: Abhishek Sagar Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ec54cb7d69d..a8929e4c77c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -306,13 +306,6 @@ ftrace_record_ip(unsigned long ip) if (ftrace_ip_in_hash(ip, key)) goto out_unlock; - /* - * There's a slight race that the ftraced will update the - * hash and reset here. If it is already converted, skip it. - */ - if (ftrace_ip_converted(ip)) - goto out_unlock; - node = ftrace_alloc_dyn_node(ip); if (!node) goto out_unlock; -- cgit v1.2.3 From eb9a7bf09172f409c10ec9560adeea95bb4045f5 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sun, 1 Jun 2008 21:47:54 +0530 Subject: ftrace: add debugfs entry 'failures' Identify functions which had their mcount call-site updates failed. This can help us track functions which ftrace shouldn't fiddle with, and are thus not being traced. If there is no race with any external agent which is modifying the mcount call-site, then this file displays no entries (normal case). 
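The compound condition added to t_next() below reads more easily as a predicate; a sketch of what it amounts to, assuming the open functions set at most one of these iterator flags at a time (skip_record is a hypothetical name used only for illustration):

        static bool skip_record(struct dyn_ftrace *rec, unsigned long iter_flags)
        {
                bool failed = rec->flags & FTRACE_FL_FAILED;

                if (iter_flags & FTRACE_ITER_FAILURES)
                        /* 'failures' shows only failed, still-allocated records */
                        return !failed || (rec->flags & FTRACE_FL_FREE);

                if (failed)     /* every other file hides failed records */
                        return true;
                if ((iter_flags & FTRACE_ITER_FILTER) && !(rec->flags & FTRACE_FL_FILTER))
                        return true;
                if ((iter_flags & FTRACE_ITER_NOTRACE) && !(rec->flags & FTRACE_FL_NOTRACE))
                        return true;
                return false;
        }
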
Signed-off-by: Abhishek Sagar Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a8929e4c77c..ad568c742bf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -768,6 +768,7 @@ enum { FTRACE_ITER_FILTER = (1 << 0), FTRACE_ITER_CONT = (1 << 1), FTRACE_ITER_NOTRACE = (1 << 2), + FTRACE_ITER_FAILURES = (1 << 3), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -799,9 +800,16 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } else { rec = &iter->pg->records[iter->idx++]; - if ((rec->flags & FTRACE_FL_FAILED) || + if ((!(iter->flags & FTRACE_ITER_FAILURES) && + (rec->flags & FTRACE_FL_FAILED)) || + + ((iter->flags & FTRACE_ITER_FAILURES) && + (!(rec->flags & FTRACE_FL_FAILED) || + (rec->flags & FTRACE_FL_FREE))) || + ((iter->flags & FTRACE_ITER_FILTER) && !(rec->flags & FTRACE_FL_FILTER)) || + ((iter->flags & FTRACE_ITER_NOTRACE) && !(rec->flags & FTRACE_FL_NOTRACE))) { rec = NULL; @@ -896,6 +904,24 @@ int ftrace_avail_release(struct inode *inode, struct file *file) return 0; } +static int +ftrace_failures_open(struct inode *inode, struct file *file) +{ + int ret; + struct seq_file *m; + struct ftrace_iterator *iter; + + ret = ftrace_avail_open(inode, file); + if (!ret) { + m = (struct seq_file *)file->private_data; + iter = (struct ftrace_iterator *)m->private; + iter->flags = FTRACE_ITER_FAILURES; + } + + return ret; +} + + static void ftrace_filter_reset(int enable) { struct ftrace_page *pg; @@ -1309,6 +1335,13 @@ static struct file_operations ftrace_avail_fops = { .release = ftrace_avail_release, }; +static struct file_operations ftrace_failures_fops = { + .open = ftrace_failures_open, + .read = seq_read, + .llseek = seq_lseek, + .release = ftrace_avail_release, +}; + static struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = ftrace_regex_read, @@ -1386,6 +1419,11 @@ static __init int ftrace_init_debugfs(void) pr_warning("Could not create debugfs " "'available_filter_functions' entry\n"); + entry = debugfs_create_file("failures", 0444, + d_tracer, NULL, &ftrace_failures_fops); + if (!entry) + pr_warning("Could not create debugfs 'failures' entry\n"); + entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, NULL, &ftrace_filter_fops); if (!entry) -- cgit v1.2.3 From 34078a5e44db3cbed2e0ed580c29a39d94e0cd97 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Tue, 3 Jun 2008 08:33:41 +0530 Subject: ftrace: prevent freeing of all failed updates Steven Rostedt wrote: > If we unload a module and reload it, will it ever get converted again? The intent was always to filter core kernel functions to prevent their freeing. Here's a fix which should allow re-recording of module call-sites. 
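The distinction doing the work here, as a sketch (should_free_failed_rec is a hypothetical helper naming the inline check in the hunks below):

        /*
         * core_kernel_text(addr)    - addr is in the core image (_stext.._etext,
         *                             plus init text while it is still around)
         * kernel_text_address(addr) - core kernel text *or* the text of any
         *                             currently loaded module
         */
        static int should_free_failed_rec(struct dyn_ftrace *rec)
        {
                /*
                 * Keep core-kernel failures (they stay FAILED and feed the
                 * 'failures' file); free module call-sites so they can be
                 * recorded again after the module is reloaded.
                 */
                return system_state == SYSTEM_BOOTING || !core_kernel_text(rec->ip);
        }
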
Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ad568c742bf..0118979e211 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -430,7 +430,7 @@ static void ftrace_replace_code(int enable) if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { rec->flags |= FTRACE_FL_FAILED; if ((system_state == SYSTEM_BOOTING) || - !kernel_text_address(rec->ip)) { + !core_kernel_text(rec->ip)) { ftrace_del_hash(rec); ftrace_free_rec(rec); } @@ -651,10 +651,9 @@ static int __ftrace_update_code(void *ignore) ftrace_update_cnt++; } else { if ((system_state == SYSTEM_BOOTING) || - !kernel_text_address(p->ip)) { + !core_kernel_text(p->ip)) { ftrace_del_hash(p); ftrace_free_rec(p); - } } } -- cgit v1.2.3 From 7def2be1dc679984f4c4fb3ef19a8a081b2454ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Jun 2008 14:49:58 +0200 Subject: sched: fix hotplug cpus on ia64 Cliff Wickman wrote: > I built an ia64 kernel from Andrew's tree (2.6.26-rc2-mm1) > and get a very predictable hotplug cpu problem. > billberry1:/tmp/cpw # ./dis > disabled cpu 17 > enabled cpu 17 > billberry1:/tmp/cpw # ./dis > disabled cpu 17 > enabled cpu 17 > billberry1:/tmp/cpw # ./dis > > The script that disables the cpu always hangs (unkillable) > on the 3rd attempt. > > And a bit further: > The kstopmachine thread always sits on the run queue (real time) for about > 30 minutes before running. this fix solves some (but not all) issues between CPU hotplug and RT bandwidth throttling. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 ++++++-- kernel/sched_rt.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 115 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 727bdef7616..e9c24a12865 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7513,21 +7513,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) static int update_sched_domains(struct notifier_block *nfb, unsigned long action, void *hcpu) { + int cpu = (int)(long)hcpu; + switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: + disable_runtime(cpu_rq(cpu)); + /* fall-through */ + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: detach_destroy_domains(&cpu_online_map); free_sched_domains(); return NOTIFY_OK; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: + case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: case CPU_ONLINE_FROZEN: + enable_runtime(cpu_rq(cpu)); + /* fall-through */ + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index eaa606071d5..8ae3416e0bb 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -286,6 +286,9 @@ static int balance_runtime(struct rt_rq *rt_rq) continue; spin_lock(&iter->rt_runtime_lock); + if (iter->rt_runtime == RUNTIME_INF) + goto next; + diff = iter->rt_runtime - iter->rt_time; if (diff > 0) { do_div(diff, weight); @@ -299,12 +302,105 @@ static int balance_runtime(struct rt_rq *rt_rq) break; } } +next: spin_unlock(&iter->rt_runtime_lock); } spin_unlock(&rt_b->rt_runtime_lock); return more; } + +static void __disable_runtime(struct rq *rq) +{ + struct root_domain *rd = rq->rd; + struct rt_rq *rt_rq; + + if (unlikely(!scheduler_running)) + return; + + 
for_each_leaf_rt_rq(rt_rq, rq) { + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + s64 want; + int i; + + spin_lock(&rt_b->rt_runtime_lock); + spin_lock(&rt_rq->rt_runtime_lock); + if (rt_rq->rt_runtime == RUNTIME_INF || + rt_rq->rt_runtime == rt_b->rt_runtime) + goto balanced; + spin_unlock(&rt_rq->rt_runtime_lock); + + want = rt_b->rt_runtime - rt_rq->rt_runtime; + + for_each_cpu_mask(i, rd->span) { + struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); + s64 diff; + + if (iter == rt_rq) + continue; + + spin_lock(&iter->rt_runtime_lock); + if (want > 0) { + diff = min_t(s64, iter->rt_runtime, want); + iter->rt_runtime -= diff; + want -= diff; + } else { + iter->rt_runtime -= want; + want -= want; + } + spin_unlock(&iter->rt_runtime_lock); + + if (!want) + break; + } + + spin_lock(&rt_rq->rt_runtime_lock); + BUG_ON(want); +balanced: + rt_rq->rt_runtime = RUNTIME_INF; + spin_unlock(&rt_rq->rt_runtime_lock); + spin_unlock(&rt_b->rt_runtime_lock); + } +} + +static void disable_runtime(struct rq *rq) +{ + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __disable_runtime(rq); + spin_unlock_irqrestore(&rq->lock, flags); +} + +static void __enable_runtime(struct rq *rq) +{ + struct root_domain *rd = rq->rd; + struct rt_rq *rt_rq; + + if (unlikely(!scheduler_running)) + return; + + for_each_leaf_rt_rq(rt_rq, rq) { + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + spin_lock(&rt_b->rt_runtime_lock); + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = rt_b->rt_runtime; + rt_rq->rt_time = 0; + spin_unlock(&rt_rq->rt_runtime_lock); + spin_unlock(&rt_b->rt_runtime_lock); + } +} + +static void enable_runtime(struct rq *rq) +{ + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __enable_runtime(rq); + spin_unlock_irqrestore(&rq->lock, flags); +} + #endif static inline int rt_se_prio(struct sched_rt_entity *rt_se) @@ -334,14 +430,13 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) #ifdef CONFIG_SMP if (rt_rq->rt_time > runtime) { - int more; - spin_unlock(&rt_rq->rt_runtime_lock); - more = balance_runtime(rt_rq); + balance_runtime(rt_rq); spin_lock(&rt_rq->rt_runtime_lock); - if (more) - runtime = sched_rt_runtime(rt_rq); + runtime = sched_rt_runtime(rt_rq); + if (runtime == RUNTIME_INF) + return 0; } #endif @@ -1174,6 +1269,8 @@ static void rq_online_rt(struct rq *rq) if (rq->rt.overloaded) rt_set_overload(rq); + __enable_runtime(rq); + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); } @@ -1183,6 +1280,8 @@ static void rq_offline_rt(struct rq *rq) if (rq->rt.overloaded) rt_clear_overload(rq); + __disable_runtime(rq); + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); } -- cgit v1.2.3 From 9985b0bab332289f14837eff3c6e0bcc658b58f7 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Jun 2008 12:57:11 -0700 Subject: sched: prevent bound kthreads from changing cpus_allowed Kthreads that have called kthread_bind() are bound to specific cpus, so other tasks should not be able to change their cpus_allowed from under them. Otherwise, it is possible to move kthreads, such as the migration or software watchdog threads, so they are not allowed access to the cpu they work on. 
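For context, the binding being protected is established at the creation site roughly like this (illustrative only; watchdog_fn and the thread name are stand-ins, error handling trimmed):

        int watchdog_fn(void *arg);     /* stand-in thread function */
        struct task_struct *p;

        p = kthread_create(watchdog_fn, NULL, "watchdog/%d", cpu);
        if (!IS_ERR(p)) {
                kthread_bind(p, cpu);   /* pins cpus_allowed; now also sets PF_THREAD_BOUND */
                wake_up_process(p);
        }

After this patch, set_cpus_allowed_ptr() and cpuset attach refuse to change that mask from another task, as the hunks below show.
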
Cc: Peter Zijlstra Cc: Paul Menage Cc: Paul Jackson Signed-off-by: David Rientjes Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 14 +++++++++++++- kernel/kthread.c | 1 + kernel/sched.c | 6 ++++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6090d18b58a..b84354f4de3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1190,6 +1190,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; + if (tsk->flags & PF_THREAD_BOUND) { + cpumask_t mask; + + mutex_lock(&callback_mutex); + mask = cs->cpus_allowed; + mutex_unlock(&callback_mutex); + if (!cpus_equal(tsk->cpus_allowed, mask)) + return -EINVAL; + } return security_task_setscheduler(tsk, 0, NULL); } @@ -1203,11 +1212,14 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); + int err; mutex_lock(&callback_mutex); guarantee_online_cpus(cs, &cpus); - set_cpus_allowed_ptr(tsk, &cpus); + err = set_cpus_allowed_ptr(tsk, &cpus); mutex_unlock(&callback_mutex); + if (err) + return; from = oldcs->mems_allowed; to = cs->mems_allowed; diff --git a/kernel/kthread.c b/kernel/kthread.c index bd1b9ea024e..97747cdd37c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) set_task_cpu(k, cpu); k->cpus_allowed = cpumask_of_cpu(cpu); k->rt.nr_cpus_allowed = 1; + k->flags |= PF_THREAD_BOUND; } EXPORT_SYMBOL(kthread_bind); diff --git a/kernel/sched.c b/kernel/sched.c index e9c24a12865..164fe7fe0d8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5563,6 +5563,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) goto out; } + if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && + !cpus_equal(p->cpus_allowed, *new_mask))) { + ret = -EINVAL; + goto out; + } + if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); else { -- cgit v1.2.3 From 6492c7f83e88a3a9521793b6934d882b97afe287 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sun, 8 Jun 2008 09:27:13 +0200 Subject: sched: trivial sched_features cleanup Remove unused debug/tuning features. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_features.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 1c7283cb958..62b39ca92eb 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -6,5 +6,3 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1) SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(HRTICK, 1) SCHED_FEAT(DOUBLE_TICK, 0) -SCHED_FEAT(NORMALIZED_SLEEPER, 1) -SCHED_FEAT(DEADLINE, 1) -- cgit v1.2.3 From e9886ca3a93d7d041d3de8e5acebe213da777d59 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 9 Jun 2008 17:12:24 +0900 Subject: sched: kill off dead cfs_rq_set_shares() Building with CONFIG_FAIR_GROUP_SCHED=y on UP results in an unused cfs_rq_set_shares() reference. As nothing is using this dummy function in the first place, just kill it off. 
Signed-off-by: Paul Mundt Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 164fe7fe0d8..07d5472dee9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1480,16 +1480,8 @@ static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); -#else /* CONFIG_SMP */ - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) -{ -} #endif -#endif /* CONFIG_SMP */ - #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" -- cgit v1.2.3 From 040ec23d07f95285e9777a85cda29cb339a3065b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Jun 2008 01:45:29 -0700 Subject: sched: sched_clock() lockdep fix Sitsofe Wheeler bisected the following commit to cause a lockdep to warn about itself and turn itself off: > commit c6531cce6e6e4b99bcda46b6268d6f2d9e30aea4 > Author: Ingo Molnar > Date: Mon May 12 21:21:14 2008 +0200 > > sched: do not trace sched_clock do not use raw irq flags in cpu_clock() as it causes lockdep to lose track of the true state of the IRQ flag. Reported-and-bisected-by: Sitsofe Wheeler Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6590a828138..b8c9fe67622 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -889,7 +889,7 @@ unsigned long long notrace cpu_clock(int cpu) unsigned long long prev_cpu_time, time, delta_time; unsigned long flags; - raw_local_irq_save(flags); + local_irq_save(flags); prev_cpu_time = per_cpu(prev_cpu_time, cpu); time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); delta_time = time-prev_cpu_time; @@ -898,7 +898,7 @@ unsigned long long notrace cpu_clock(int cpu) time = __sync_cpu_clock(time, cpu); per_cpu(prev_cpu_time, cpu) = time; } - raw_local_irq_restore(flags); + local_irq_restore(flags); return time; } -- cgit v1.2.3 From 2b1bce1787700768cbc87c8509851c6f49d252dc Mon Sep 17 00:00:00 2001 From: Ankita Garg Date: Mon, 9 Jun 2008 14:10:25 +0530 Subject: ftrace: disable tracing when current_tracer is set to "none" Found that inspite of setting the current_tracer to "none", trace from the previous trace type continued to be collected. The patch below fixes this and causes the trace to be disabled when the "none" type is selected. Compile and boot tested the patch for functionality. 
Signed-off-by: Ankita Garg Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 12f5e817380..dde6f0ace6d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -43,11 +43,6 @@ static cpumask_t __read_mostly tracing_buffer_mask; #define for_each_tracing_cpu(cpu) \ for_each_cpu_mask(cpu, tracing_buffer_mask) -/* dummy trace to disable tracing */ -static struct tracer no_tracer __read_mostly = { - .name = "none", -}; - static int trace_alloc_page(void); static int trace_free_page(void); @@ -135,6 +130,23 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds iter_ctrl options */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; +static notrace void no_trace_init(struct trace_array *tr) +{ + int cpu; + + if(tr->ctrl) + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); + tracer_enabled = 0; +} + +/* dummy trace to disable tracing */ +static struct tracer no_tracer __read_mostly = { + .name = "none", + .init = no_trace_init +}; + + /** * trace_wake_up - wake up tasks waiting for trace input * -- cgit v1.2.3 From 1eede070a59e1cc73da51e1aaa00d9ab86572cfc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 20 May 2008 23:00:01 +0200 Subject: Introduce new top level suspend and hibernation callbacks Introduce 'struct pm_ops' and 'struct pm_ext_ops' ('ext' meaning 'extended') representing suspend and hibernation operations for bus types, device classes, device types and device drivers. Modify the PM core to use 'struct pm_ops' and 'struct pm_ext_ops' objects, if defined, instead of the ->suspend(), ->resume(), ->suspend_late(), and ->resume_early() callbacks (the old callbacks will be considered as legacy and gradually phased out). The main purpose of doing this is to separate suspend (aka S2RAM and standby) callbacks from hibernation callbacks in such a way that the new callbacks won't take arguments and the semantics of each of them will be clearly specified. This has been requested for multiple times by many people, including Linus himself, and the reason is that within the current scheme if ->resume() is called, for example, it's difficult to say why it's been called (ie. is it a resume from RAM or from hibernation or a suspend/hibernation failure etc.?). The second purpose is to make the suspend/hibernation callbacks more flexible so that device drivers can handle more than they can within the current scheme. For example, some drivers may need to prevent new children of the device from being registered before their ->suspend() callbacks are executed or they may want to carry out some operations requiring the availability of some other devices, not directly bound via the parent-child relationship, in order to prepare for the execution of ->suspend(), etc. Ultimately, we'd like to stop using the freezing of tasks for suspend and therefore the drivers' suspend/hibernation code will have to take care of the handling of the user space during suspend/hibernation. That, in turn, would be difficult within the current scheme, without the new ->prepare() and ->complete() callbacks. Signed-off-by: Rafael J. 
Wysocki Acked-by: Pavel Machek Signed-off-by: Jesse Barnes --- kernel/power/disk.c | 22 +++++++++++++++------- kernel/power/main.c | 6 ++++-- 2 files changed, 19 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 14a656cdc65..d416be0efa8 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -193,6 +193,7 @@ static int create_image(int platform_mode) if (error) return error; + device_pm_lock(); local_irq_disable(); /* At this point, device_suspend() has been called, but *not* * device_power_down(). We *must* call device_power_down() now. @@ -224,9 +225,11 @@ static int create_image(int platform_mode) /* NOTE: device_power_up() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ - device_power_up(); + device_power_up(in_suspend ? + (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); Enable_irqs: local_irq_enable(); + device_pm_unlock(); return error; } @@ -280,7 +283,8 @@ int hibernation_snapshot(int platform_mode) Finish: platform_finish(platform_mode); Resume_devices: - device_resume(); + device_resume(in_suspend ? + (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); Resume_console: resume_console(); Close: @@ -300,8 +304,9 @@ static int resume_target_kernel(void) { int error; + device_pm_lock(); local_irq_disable(); - error = device_power_down(PMSG_PRETHAW); + error = device_power_down(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -329,9 +334,10 @@ static int resume_target_kernel(void) swsusp_free(); restore_processor_state(); touch_softlockup_watchdog(); - device_power_up(); + device_power_up(PMSG_RECOVER); Enable_irqs: local_irq_enable(); + device_pm_unlock(); return error; } @@ -350,7 +356,7 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); - error = device_suspend(PMSG_PRETHAW); + error = device_suspend(PMSG_QUIESCE); if (error) goto Finish; @@ -362,7 +368,7 @@ int hibernation_restore(int platform_mode) enable_nonboot_cpus(); } platform_restore_cleanup(platform_mode); - device_resume(); + device_resume(PMSG_RECOVER); Finish: resume_console(); pm_restore_console(); @@ -403,6 +409,7 @@ int hibernation_platform_enter(void) if (error) goto Finish; + device_pm_lock(); local_irq_disable(); error = device_power_down(PMSG_HIBERNATE); if (!error) { @@ -411,6 +418,7 @@ int hibernation_platform_enter(void) while (1); } local_irq_enable(); + device_pm_unlock(); /* * We don't need to reenable the nonboot CPUs or resume consoles, since @@ -419,7 +427,7 @@ int hibernation_platform_enter(void) Finish: hibernation_ops->finish(); Resume_devices: - device_resume(); + device_resume(PMSG_RESTORE); Resume_console: resume_console(); Close: diff --git a/kernel/power/main.c b/kernel/power/main.c index 6a6d5eb3524..d023b6b584e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -228,6 +228,7 @@ static int suspend_enter(suspend_state_t state) { int error = 0; + device_pm_lock(); arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); @@ -239,10 +240,11 @@ static int suspend_enter(suspend_state_t state) if (!suspend_test(TEST_CORE)) error = suspend_ops->enter(state); - device_power_up(); + device_power_up(PMSG_RESUME); Done: arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); + device_pm_unlock(); return error; } @@ -291,7 +293,7 @@ int suspend_devices_and_enter(suspend_state_t state) if (suspend_ops->finish) suspend_ops->finish(); Resume_devices: - device_resume(); + device_resume(PMSG_RESUME); 
Resume_console: resume_console(); Close: -- cgit v1.2.3 From 20764ff1efb440640353053ec83263e69e1259e0 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 12 Jun 2008 11:27:03 +0200 Subject: ftrace: fix printout Do not print loglevel before "entries of %ld bytes". Move it to the previous pr_info. Signed-off-by: Jiri Slaby Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dde6f0ace6d..6e9dae7eb41 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3051,9 +3051,8 @@ __init static int tracer_alloc_buffers(void) } max_tr.entries = global_trace.entries; - pr_info("tracer: %d pages allocated for %ld", - pages, trace_nr_entries); - pr_info(" entries of %ld bytes\n", (long)TRACE_ENTRY_SIZE); + pr_info("tracer: %d pages allocated for %ld entries of %ld bytes\n", + pages, trace_nr_entries, (long)TRACE_ENTRY_SIZE); pr_info(" actual entries %ld\n", global_trace.entries); tracer_init_debugfs(); -- cgit v1.2.3 From 2e084786f6fe052274f1dfa7c675fe4a02cacd6e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 12 Jun 2008 16:42:58 +0800 Subject: sched: fair group: fix overflow (was: fix divide by zero) I found a bug which can be reproduced this way (linux-2.6.26-rc5, x86-64; use 2^32, 2^33, ..., 2^63 as the shares value): # mkdir /dev/cpuctl # mount -t cgroup -o cpu cpuctl /dev/cpuctl # cd /dev/cpuctl # mkdir sub # echo 0x8000000000000000 > sub/cpu.shares # echo $$ > sub/tasks oops here! divide by zero. This is because do_div() expects the 2nd parameter to be 32 bits, but unsigned long is 64 bits in x86_64. Peter Zijlstra pointed out that the sane thing to do is to limit the shares value to something smaller instead of using an even more expensive divide. Also, I found another bug about "the shares value is too large": pid1 and pid2 have their affinity set to cpu#0; pid1 is attached to cg1 and pid2 is attached to cg2. If cg1/cpu.shares = 1024 and cg2/cpu.shares = 2000000000, then pid2 gets 100% of the cpu and pid1 gets 0%. If cg1/cpu.shares = 1024 and cg2/cpu.shares = 20000000000, then pid2 gets 0% of the cpu and pid1 gets 100%. And the weight of a cfs_rq is the sum of the weights of the entities queued on it, so the shares value should be limited to a smaller value. I think that (1UL << 18) is a good limit: 1) it's not too large, so we can create a lot of groups before overflowing; 2) it's several times the weight value for nice=-19 (not too small). Signed-off-by: Lai Jiangshan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2c65bf29d13..6c1ecbdc0db 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -312,12 +312,15 @@ static DEFINE_SPINLOCK(task_group_lock); #endif /* - * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems. + * A weight of 0 or 1 can cause arithmetics problems. + * A weight of a cfs_rq is the sum of weights of which entities + * are queued on this cfs_rq, so a weight of a entity should not be + * too large, so as the shares value of a task group. * (The default weight is 1024 - so there's no practical * limitation from this.)
*/ #define MIN_SHARES 2 -#define MAX_SHARES (ULONG_MAX - 1) +#define MAX_SHARES (1UL << 18) static int init_task_group_load = INIT_TASK_GROUP_LOAD; #endif -- cgit v1.2.3 From 7a232e0350940d2664f4de5cc3f0f443bae5062d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 12 Jun 2008 16:43:07 +0800 Subject: sched: 64-bit: fix arithmetics overflow (overflow means weight >= 2^32 here, because inv_weigh = 2^32/weight) A weight of a cfs_rq is the sum of weights of which entities are queued on this cfs_rq, so it will overflow when there are too many entities. Although, overflow occurs very rarely, but it break fairness when it occurs. 64-bits systems have more memory than 32-bit systems and 64-bit systems can create more process usually, so overflow may occur more frequently. This patch guarantees fairness when overflow happens on 64-bit systems. Thanks to the optimization of compiler, it changes nothing on 32-bit. Signed-off-by: Lai Jiangshan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6c1ecbdc0db..eaf6751e761 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1340,8 +1340,13 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, { u64 tmp; - if (!lw->inv_weight) - lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1); + if (!lw->inv_weight) { + if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) + lw->inv_weight = 1; + else + lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) + / (lw->weight+1); + } tmp = (u64)delta_exec * weight; /* -- cgit v1.2.3 From d8f3de0d2412bb91639cfefc5b3c79dbf3812212 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 12 Jun 2008 23:24:06 +0200 Subject: Suspend-related patches for 2.6.27 ACPI PM: Add possibility to change suspend sequence There are some systems out there that don't work correctly with our current suspend/hibernation code ordering. Provide a workaround for these systems allowing them to pass 'acpi_sleep=old_ordering' in the kernel command line so that it will use the pre-ACPI 2.0 ("old") suspend code ordering. Unfortunately, this requires us to add a platform hook to the resuming of devices for recovering the platform in case one of the device drivers' .suspend() routines returns error code. Namely, ACPI 1.0 specifies that _PTS should be called before suspending devices, but _WAK still should be called before resuming them in order to undo the changes made by _PTS. However, if there is an error during suspending devices, they are automatically resumed without returning control to the PM core, so the _WAK has to be called from within device_resume() in that cases. The patch also reorders and refactors the ACPI suspend/hibernation code to avoid duplication as far as reasonably possible. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Jesse Barnes --- kernel/power/disk.c | 28 ++++++++++++++++++++++------ kernel/power/main.c | 10 +++++++--- 2 files changed, 29 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index d416be0efa8..f011e0870b5 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -179,6 +179,17 @@ static void platform_restore_cleanup(int platform_mode) hibernation_ops->restore_cleanup(); } +/** + * platform_recover - recover the platform from a failure to suspend + * devices. 
+ */ + +static void platform_recover(int platform_mode) +{ + if (platform_mode && hibernation_ops && hibernation_ops->recover) + hibernation_ops->recover(); +} + /** * create_image - freeze devices that need to be frozen with interrupts * off, create the hibernation image and thaw those devices. Control @@ -258,10 +269,10 @@ int hibernation_snapshot(int platform_mode) suspend_console(); error = device_suspend(PMSG_FREEZE); if (error) - goto Resume_console; + goto Recover_platform; if (hibernation_test(TEST_DEVICES)) - goto Resume_devices; + goto Recover_platform; error = platform_pre_snapshot(platform_mode); if (error || hibernation_test(TEST_PLATFORM)) @@ -285,11 +296,14 @@ int hibernation_snapshot(int platform_mode) Resume_devices: device_resume(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - Resume_console: resume_console(); Close: platform_end(platform_mode); return error; + + Recover_platform: + platform_recover(platform_mode); + goto Resume_devices; } /** @@ -398,8 +412,11 @@ int hibernation_platform_enter(void) suspend_console(); error = device_suspend(PMSG_HIBERNATE); - if (error) - goto Resume_console; + if (error) { + if (hibernation_ops->recover) + hibernation_ops->recover(); + goto Resume_devices; + } error = hibernation_ops->prepare(); if (error) @@ -428,7 +445,6 @@ int hibernation_platform_enter(void) hibernation_ops->finish(); Resume_devices: device_resume(PMSG_RESTORE); - Resume_console: resume_console(); Close: hibernation_ops->end(); diff --git a/kernel/power/main.c b/kernel/power/main.c index d023b6b584e..3398f4651aa 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -269,11 +269,11 @@ int suspend_devices_and_enter(suspend_state_t state) error = device_suspend(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to suspend\n"); - goto Resume_console; + goto Recover_platform; } if (suspend_test(TEST_DEVICES)) - goto Resume_devices; + goto Recover_platform; if (suspend_ops->prepare) { error = suspend_ops->prepare(); @@ -294,12 +294,16 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_ops->finish(); Resume_devices: device_resume(PMSG_RESUME); - Resume_console: resume_console(); Close: if (suspend_ops->end) suspend_ops->end(); return error; + + Recover_platform: + if (suspend_ops->recover) + suspend_ops->recover(); + goto Resume_devices; } /** -- cgit v1.2.3 From 67dddaad5d8b8c5ee5b96a7e2f6cb0faad703865 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 12 Jun 2008 15:21:35 -0700 Subject: kprobes: fix error checking of batch registration Fix error checking routine to catch an error which occurs in first __register_*probe(). 
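The pattern being corrected generalizes beyond kprobes. The fragment below is a self-contained sketch of the batch-registration error handling the patch establishes (register_one(), unregister_many() and struct item are illustrative stand-ins, not kernel API): stop on any failure, and roll back only the entries that actually succeeded.

struct item;
extern int register_one(struct item *it);
extern void unregister_many(struct item **its, int num);

static int register_many(struct item **its, int num)
{
	int i, ret = 0;

	for (i = 0; i < num; i++) {
		ret = register_one(its[i]);
		if (ret < 0) {
			if (i > 0)
				unregister_many(its, i);	/* undo its[0..i-1] only */
			break;	/* report the failure even when i == 0 */
		}
	}
	return ret;
}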
Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: David Miller Cc: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1e0250cb948..d4998f81e22 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -699,8 +699,9 @@ static int __register_kprobes(struct kprobe **kps, int num, return -EINVAL; for (i = 0; i < num; i++) { ret = __register_kprobe(kps[i], called_from); - if (ret < 0 && i > 0) { - unregister_kprobes(kps, i); + if (ret < 0) { + if (i > 0) + unregister_kprobes(kps, i); break; } } @@ -776,8 +777,9 @@ static int __register_jprobes(struct jprobe **jps, int num, jp->kp.break_handler = longjmp_break_handler; ret = __register_kprobe(&jp->kp, called_from); } - if (ret < 0 && i > 0) { - unregister_jprobes(jps, i); + if (ret < 0) { + if (i > 0) + unregister_jprobes(jps, i); break; } } @@ -920,8 +922,9 @@ static int __register_kretprobes(struct kretprobe **rps, int num, return -EINVAL; for (i = 0; i < num; i++) { ret = __register_kretprobe(rps[i], called_from); - if (ret < 0 && i > 0) { - unregister_kretprobes(rps, i); + if (ret < 0) { + if (i > 0) + unregister_kretprobes(rps, i); break; } } -- cgit v1.2.3 From 2429e4ee78e2fa40f82a4572dd21d4f3b4de9325 Mon Sep 17 00:00:00 2001 From: "Huang, Ying" Date: Fri, 13 Jun 2008 14:40:17 +0800 Subject: lockdep: output lock_class key instead of address for forward dependency output The key instead of address of lock_class should be output in /proc/lockdep when forward dependency is output, because key is output for lock_class itself as identifier too. This patch is based on x86/auto-latest branch of git-x86 tree, and has been tested on x86_64 platform. Signed-off-by: Huang Ying Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index dc5d29648d8..688c5f1940b 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -139,7 +139,7 @@ static int l_show(struct seq_file *m, void *v) list_for_each_entry(entry, &class->locks_after, entry) { if (entry->distance == 1) { - seq_printf(m, " -> [%p] ", entry->class); + seq_printf(m, " -> [%p] ", entry->class->key); print_name(m, entry->class); seq_puts(m, "\n"); } -- cgit v1.2.3 From a5a242dceed5d1c74fe46088762a9e4312c2d000 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 13 Jun 2008 11:00:14 +0200 Subject: stacktrace: print_stack_trace() cleanup - shorter code and better atomicity with regards to printk(). (It's been tested with the backtrace self-test code on i386 and x86_64.) 
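The cleanup leans on the standard dynamic-field-width conversion. The small user-space program below (illustrative only, not taken from the kernel) shows how a single "%*c" conversion replaces the old print-N-spaces loop:

#include <stdio.h>

int main(void)
{
	int spaces = 4;

	/* Width 1 + spaces with the character ' ' emits exactly 1 + spaces blanks. */
	printf("%*c%s\n", 1 + spaces, ' ', "symbol_name+0x10/0x80");
	return 0;
}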
Cc: Arjan van de Ven Signed-off-by: Vegard Nossum Signed-off-by: Ingo Molnar --- kernel/stacktrace.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 0914d0cbc83..7eaea9d02a5 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -11,17 +11,14 @@ void print_stack_trace(struct stack_trace *trace, int spaces) { - int i, j; + int i; if (WARN_ON(!trace->entries)) return; for (i = 0; i < trace->nr_entries; i++) { - unsigned long ip = trace->entries[i]; - - for (j = 0; j < spaces + 1; j++) - printk(" "); - print_ip_sym(ip); + printk("%*c", 1 + spaces, ' '); + print_ip_sym(trace->entries[i]); } } -- cgit v1.2.3 From a4500b84c51645bbc86be3ca84f2252b7ada060f Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 14 Jun 2008 11:59:39 +0530 Subject: ftrace: fix "notrace" filtering priority This is a fix to give notrace filter rules priority over "set_ftrace_filter" rules. This fix ensures that functions which are set to be filtered and are concurrently marked as "notrace" don't get recorded. As of now, if a record is marked as FTRACE_FL_FILTER and is enabled, then the notrace flag is not checked. Tested on x86-32. Signed-off-by: Abhishek Sagar Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0118979e211..b532e4a68c7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -355,20 +355,26 @@ __ftrace_replace_code(struct dyn_ftrace *rec, * If this record is set not to trace then * do nothing. * + * If this record is set not to trace and + * it is enabled then disable it. + * * If this record is not set to be filtered and * it is enabled, disable it. */ - fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); + + fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE | + FTRACE_FL_ENABLED); if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || - (fl == 0) || (rec->flags & FTRACE_FL_NOTRACE)) + (fl == (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) || + !fl || (fl == FTRACE_FL_NOTRACE)) return 0; /* * If it is enabled disable it, * otherwise enable it! */ - if (fl == FTRACE_FL_ENABLED) { + if (fl & FTRACE_FL_ENABLED) { /* swap new and old */ new = old; old = ftrace_call_replace(ip, FTRACE_ADDR); -- cgit v1.2.3 From 87c8a64475f0597b7fd9c36d2f867ae8ef4a9eca Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 2 Jun 2008 13:19:08 +0200 Subject: printk: export console_drivers this symbol is needed by drivers/video/xen-fbfront.ko. Signed-off-by: Ingo Molnar --- kernel/printk.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 475fc22a285..70cfa5ac75c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -75,6 +75,8 @@ EXPORT_SYMBOL(oops_in_progress); static DECLARE_MUTEX(console_sem); static DECLARE_MUTEX(secondary_console_sem); struct console *console_drivers; +EXPORT_SYMBOL_GPL(console_drivers); + /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's -- cgit v1.2.3 From 906d882cacecd37ad2fdd03ed2a9b232bcb9507e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 9 Jun 2008 16:35:25 -0700 Subject: rcu: remove unused field struct rcu_data::rcu_tasklet Since softirq works for rcu reclaimer, rcu_tasklet is unused now. Signed-off-by: Lai Jiangshan Cc: "Paul E. 
McKenney" Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index b8e4cdae4e8..396b121edfe 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -82,7 +82,6 @@ struct rcu_data { spinlock_t lock; /* Protect rcu_data fields. */ long completed; /* Number of last completed batch. */ int waitlistcount; - struct tasklet_struct rcu_tasklet; struct rcu_head *nextlist; struct rcu_head **nexttail; struct rcu_head *waitlist[GP_STAGES]; -- cgit v1.2.3 From f22529351f7060d61eff3b76d7c9706f90aaedf3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 22 May 2008 10:37:48 +0200 Subject: namespacecheck: fixes Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 +- kernel/trace/trace_sched_switch.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b532e4a68c7..0d5bcf69952 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -50,7 +50,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly = static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; -void ftrace_list_func(unsigned long ip, unsigned long parent_ip) +static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op = ftrace_list; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index c16935d3bc5..93a66200915 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -196,7 +196,7 @@ static void tracing_sched_unregister(void) &ctx_trace); } -void tracing_start_sched_switch(void) +static void tracing_start_sched_switch(void) { long ref; @@ -205,7 +205,7 @@ void tracing_start_sched_switch(void) tracing_sched_register(); } -void tracing_stop_sched_switch(void) +static void tracing_stop_sched_switch(void) { long ref; -- cgit v1.2.3 From 95e904c7da715aa2dbfb595da66b63de37a0bb04 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Sun, 11 May 2008 05:55:33 +0530 Subject: sched: fix defined-but-unused warning Fix this warning, which appears with !CONFIG_SMP: kernel/sched.c:1216: warning: `init_hrtick' defined but not used Signed-off-by: Rabin Vincent Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index eaf6751e761..04228524d16 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1127,6 +1127,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) return HRTIMER_NORESTART; } +#ifdef CONFIG_SMP static void hotplug_hrtick_disable(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -1182,6 +1183,7 @@ static void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } +#endif /* CONFIG_SMP */ static void init_rq_hrtick(struct rq *rq) { -- cgit v1.2.3 From 5af970a48f3ba0dd96a036b196c79dc923f28231 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Jun 2008 10:09:48 +0200 Subject: rcutorture: WARN_ON_ONCE(1) when detecting an error this makes it easier for automated tests to pick up such failures. 
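For context, the reason WARN_ON_ONCE() suits test automation is its fire-once behaviour. The macro below is a simplified, illustrative re-derivation of that behaviour (the real definition lives in the architecture bug headers and differs in detail): one "WARNING:" backtrace on the first failure, silence afterwards, so a dmesg scanner sees the error exactly once.

#define EXAMPLE_WARN_ON_ONCE(condition) ({		\
	static int __warned;				\
	int __ret = !!(condition);			\
							\
	if (__ret && !__warned) {			\
		__warned = 1;				\
		WARN_ON(1);	/* single backtrace, easy to grep in dmesg */ \
	}						\
	__ret;						\
})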
--- kernel/rcutorture.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 0334b6a8bac..0ca7e9b290b 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -687,6 +687,7 @@ rcu_torture_printk(char *page) if (i > 1) { cnt += sprintf(&page[cnt], "!!! "); atomic_inc(&n_rcu_torture_error); + WARN_ON_ONCE(1); } cnt += sprintf(&page[cnt], "Reader Pipe: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) -- cgit v1.2.3 From 20b6331bfed1f07ba1e5006889a5d64adc53615e Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Wed, 11 Jun 2008 00:58:30 +0200 Subject: sched: rework of "prioritize non-migratable tasks over migratable ones" regarding this commit: 45c01e824991b2dd0a332e19efc4901acb31209f I think we can do it simpler. Please take a look at the patch below. Instead of having 2 separate arrays (which is + ~800 bytes on x86_32 and twice so on x86_64), let's add "exclusive" (the ones that are bound to this CPU) tasks to the head of the queue and "shared" ones -- to the end. In case of a few newly woken up "exclusive" tasks, they are 'stacked' (not queued as now), meaning that a task {i+1} is being placed in front of the previously woken up task {i}. But I don't think that this behavior may cause any realistic problems. There are a couple of changes on top of this one. (1) in check_preempt_curr_rt() I don't think there is a need for the "pick_next_rt_entity(rq, &rq->rt) != &rq->curr->rt" check. enqueue_task_rt(p) and check_preempt_curr_rt() are always called one after another with rq->lock being held so the following check "p->rt.nr_cpus_allowed == 1 && rq->curr->rt.nr_cpus_allowed != 1" should be enough (well, just its left part) to guarantee that 'p' has been queued in front of the 'curr'. (2) in set_cpus_allowed_rt() I don't thinks there is a need for requeue_task_rt() here. Perhaps, the only case when 'requeue' (+ reschedule) might be useful is as follows: i) weight == 1 && cpu_isset(task_cpu(p), *new_mask) i.e. a task is being bound to this CPU); ii) 'p' != rq->curr but here, 'p' has already been on this CPU for a while and was not migrated. i.e. it's possible that 'rq->curr' would not have high chances to be migrated right at this particular moment (although, has chance in a bit longer term), should we allow it to be preempted. Anyway, I think we should not perhaps make it more complex trying to address some rare corner cases. For instance, that's why a single queue approach would be preferable. Unless I'm missing something obvious, this approach gives us similar functionality at lower cost. Verified only compilation-wise. 
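As a compact illustration of the single-queue policy argued for above (struct rt_entity_stub is a stripped-down stand-in; list_add() and list_add_tail() are the ordinary <linux/list.h> primitives), CPU-bound entities go to the head of the per-priority list and migratable ones to the tail:

#include <linux/list.h>

struct rt_entity_stub {
	struct list_head run_list;
	int nr_cpus_allowed;
};

static void queue_entity(struct list_head *queue, struct rt_entity_stub *se)
{
	if (se->nr_cpus_allowed == 1)
		list_add(&se->run_list, queue);		/* head: dispatched sooner */
	else
		list_add_tail(&se->run_list, queue);	/* tail: round-robins fairly */
}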
(Almost)-Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++---- kernel/sched_rt.c | 44 +++++++++----------------------------------- 2 files changed, 11 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 554de400980..cc1d558406f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -153,8 +153,7 @@ static inline int task_has_rt_policy(struct task_struct *p) */ struct rt_prio_array { DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ - struct list_head xqueue[MAX_RT_PRIO]; /* exclusive queue */ - struct list_head squeue[MAX_RT_PRIO]; /* shared queue */ + struct list_head queue[MAX_RT_PRIO]; }; struct rt_bandwidth { @@ -7620,8 +7619,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) array = &rt_rq->active; for (i = 0; i < MAX_RT_PRIO; i++) { - INIT_LIST_HEAD(array->xqueue + i); - INIT_LIST_HEAD(array->squeue + i); + INIT_LIST_HEAD(array->queue + i); __clear_bit(i, array->bitmap); } /* delimiter for bitsearch: */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8ae3416e0bb..f721b52acd8 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -576,16 +576,15 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se) struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; struct rt_rq *group_rq = group_rt_rq(rt_se); + struct list_head *queue = array->queue + rt_se_prio(rt_se); if (group_rq && rt_rq_throttled(group_rq)) return; if (rt_se->nr_cpus_allowed == 1) - list_add_tail(&rt_se->run_list, - array->xqueue + rt_se_prio(rt_se)); + list_add(&rt_se->run_list, queue); else - list_add_tail(&rt_se->run_list, - array->squeue + rt_se_prio(rt_se)); + list_add_tail(&rt_se->run_list, queue); __set_bit(rt_se_prio(rt_se), array->bitmap); @@ -598,8 +597,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) struct rt_prio_array *array = &rt_rq->active; list_del_init(&rt_se->run_list); - if (list_empty(array->squeue + rt_se_prio(rt_se)) - && list_empty(array->xqueue + rt_se_prio(rt_se))) + if (list_empty(array->queue + rt_se_prio(rt_se))) __clear_bit(rt_se_prio(rt_se), array->bitmap); dec_rt_tasks(rt_se, rt_rq); @@ -666,11 +664,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) /* * Put task to the end of the run list without the overhead of dequeue * followed by enqueue. - * - * Note: We always enqueue the task to the shared-queue, regardless of its - * previous position w.r.t. exclusive vs shared. This is so that exclusive RR - * tasks fairly round-robin with all tasks on the runqueue, not just other - * exclusive tasks. 
*/ static void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) @@ -678,7 +671,7 @@ void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) struct rt_prio_array *array = &rt_rq->active; list_del_init(&rt_se->run_list); - list_add_tail(&rt_se->run_list, array->squeue + rt_se_prio(rt_se)); + list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); } static void requeue_task_rt(struct rq *rq, struct task_struct *p) @@ -736,9 +729,6 @@ static int select_task_rq_rt(struct task_struct *p, int sync) } #endif /* CONFIG_SMP */ -static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, - struct rt_rq *rt_rq); - /* * Preempt the current task with a newly woken task if needed: */ @@ -764,8 +754,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) */ if((p->prio == rq->curr->prio) && p->rt.nr_cpus_allowed == 1 - && rq->curr->rt.nr_cpus_allowed != 1 - && pick_next_rt_entity(rq, &rq->rt) != &rq->curr->rt) { + && rq->curr->rt.nr_cpus_allowed != 1) { cpumask_t mask; if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) @@ -789,15 +778,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, idx = sched_find_first_bit(array->bitmap); BUG_ON(idx >= MAX_RT_PRIO); - queue = array->xqueue + idx; - if (!list_empty(queue)) - next = list_entry(queue->next, struct sched_rt_entity, - run_list); - else { - queue = array->squeue + idx; - next = list_entry(queue->next, struct sched_rt_entity, - run_list); - } + queue = array->queue + idx; + next = list_entry(queue->next, struct sched_rt_entity, run_list); return next; } @@ -867,7 +849,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) continue; if (next && next->prio < idx) continue; - list_for_each_entry(rt_se, array->squeue + idx, run_list) { + list_for_each_entry(rt_se, array->queue + idx, run_list) { struct task_struct *p = rt_task_of(rt_se); if (pick_rt_task(rq, p, cpu)) { next = p; @@ -1249,14 +1231,6 @@ static void set_cpus_allowed_rt(struct task_struct *p, } update_rt_migration(rq); - - if (unlikely(weight == 1 || p->rt.nr_cpus_allowed == 1)) - /* - * If either the new or old weight is a "1", we need - * to requeue to properly move between shared and - * exclusive queues. - */ - requeue_task_rt(rq, p); } p->cpus_allowed = *new_mask; -- cgit v1.2.3 From e17ba73b0ee6c0f24393c48b455e0d8db761782c Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 12 May 2008 15:44:40 +0200 Subject: x86, generic: mark early_printk as asmlinkage It's not explicitly marked as asmlinkage, but invoked from x86_32 startup code with parameters on stack. No other architectures define early_printk and none of them are affected by this change, since defines asmlinkage as empty token. Signed-off-by: Jiri Slaby Cc: H. Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 70cfa5ac75c..de1a4f4470c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -38,7 +38,7 @@ /* * Architectures can override it: */ -void __attribute__((weak)) early_printk(const char *fmt, ...) +void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) 
{ } -- cgit v1.2.3 From 4620b49f76096fa5183eecad7d689faa898a4c82 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 12 Jun 2008 23:21:53 +0200 Subject: softirq: remove initialization of static per-cpu variable Signed-off-by: Vegard Nossum Signed-off-by: Ingo Molnar --- kernel/softirq.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 059256874e9..86775340ef1 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -359,10 +359,8 @@ struct tasklet_head struct tasklet_struct **tail; }; -/* Some compilers disobey section attribute on statics when not - initialized -- RR */ -static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL }; -static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL }; +static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); +static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); void __tasklet_schedule(struct tasklet_struct *t) { -- cgit v1.2.3 From d120f65f3aaf306c957bc4c82e510f5b0f1e9b27 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Jun 2008 05:21:44 -0700 Subject: rcu: make rcutorture more vicious: add stutter feature This patch takes a step towards making rcutorture more brutal by allowing the test to be automatically periodically paused, with the default being to run the test for five seconds then pause for five seconds and repeat. This behavior can be controlled using a new "stutter" module parameter, so that "stutter=0" gives the old default behavior of running continuously. Starting and stopping rcutorture more heavily stresses RCU's interaction with the scheduler, as well as exercising more paths through the grace-period detection code. Note that the default to "shuffle_interval" has also been adjusted from 5 seconds to 3 seconds to provide varying overlap with the "stutter" interval. I am still unable to provoke the failures that Alexey has been seeing, even with this patch, but will be doing a few additional things to beef up rcutorture. Suggested-by: Ingo Molnar Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcutorture.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 0ca7e9b290b..98ae7d16822 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -57,7 +57,8 @@ static int stat_interval; /* Interval between stats, in seconds. */ /* Defaults to "only at end of test". */ static int verbose; /* Print more debug info. */ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ -static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ +static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ +static int stutter = 5; /* Start/stop testing interval (in sec) */ static char *torture_type = "rcu"; /* What RCU implementation to torture. 
*/ module_param(nreaders, int, 0444); @@ -72,6 +73,8 @@ module_param(test_no_idle_hz, bool, 0444); MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); module_param(shuffle_interval, int, 0444); MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); +module_param(stutter, int, 0444); +MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); @@ -91,6 +94,7 @@ static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; static struct task_struct *stats_task; static struct task_struct *shuffler_task; +static struct task_struct *stutter_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -119,6 +123,8 @@ static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; static struct list_head rcu_torture_removed; +static int stutter_pause_test = 0; + /* * Allocate an element from the rcu_tortures pool. */ @@ -179,6 +185,13 @@ rcu_random(struct rcu_random_state *rrsp) return swahw32(rrsp->rrs_state); } +static void +rcu_stutter_wait(void) +{ + while (stutter_pause_test) + schedule_timeout_interruptible(1); +} + /* * Operations vector for selecting different types of tests. */ @@ -563,6 +576,7 @@ rcu_torture_writer(void *arg) } rcu_torture_current_version++; oldbatch = cur_ops->completed(); + rcu_stutter_wait(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); while (!kthread_should_stop()) @@ -586,6 +600,7 @@ rcu_torture_fakewriter(void *arg) schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); udelay(rcu_random(&rand) & 0x3ff); cur_ops->sync(); + rcu_stutter_wait(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); @@ -641,6 +656,7 @@ rcu_torture_reader(void *arg) preempt_enable(); cur_ops->readunlock(idx); schedule(); + rcu_stutter_wait(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); while (!kthread_should_stop()) @@ -812,15 +828,34 @@ rcu_torture_shuffle(void *arg) return 0; } +/* Cause the rcutorture test to "stutter", starting and stopping all + * threads periodically. 
+ */ +static int +rcu_torture_stutter(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); + do { + schedule_timeout_interruptible(stutter * HZ); + stutter_pause_test = 1; + if (!kthread_should_stop()) + schedule_timeout_interruptible(stutter * HZ); + stutter_pause_test = 0; + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); + return 0; +} + static inline void rcu_torture_print_module_parms(char *tag) { printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d nfakewriters=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval = %d\n", + "shuffle_interval=%d stutter=%d\n", torture_type, tag, nrealreaders, nfakewriters, - stat_interval, verbose, test_no_idle_hz, shuffle_interval); + stat_interval, verbose, test_no_idle_hz, shuffle_interval, + stutter); } static void @@ -829,6 +864,11 @@ rcu_torture_cleanup(void) int i; fullstop = 1; + if (stutter_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); + kthread_stop(stutter_task); + } + stutter_task = NULL; if (shuffler_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); kthread_stop(shuffler_task); @@ -1017,6 +1057,19 @@ rcu_torture_init(void) goto unwind; } } + if (stutter < 0) + stutter = 0; + if (stutter) { + /* Create the stutter thread */ + stutter_task = kthread_run(rcu_torture_stutter, NULL, + "rcu_torture_stutter"); + if (IS_ERR(stutter_task)) { + firsterr = PTR_ERR(stutter_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); + stutter_task = NULL; + goto unwind; + } + } return 0; unwind: -- cgit v1.2.3 From 49307fd6f72bdd68cc2bd23e7da0bcfecf8087c9 Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Wed, 18 Jun 2008 09:18:38 +0200 Subject: sched: NULL pointer dereference while setting sched_rt_period_us MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_RT_GROUP_SCHED and CONFIG_CGROUP_SCHED are enabled, with: echo 10000 > /proc/sys/kernel/sched_rt_period_us We get this: BUG: unable to handle kernel NULL pointer dereference at 0000008c [ 947.682233] IP: [] __rt_schedulable+0x12/0x160 [ 947.683123] *pde = 00000000=20 [ 947.683782] Oops: 0000 [#1] [ 947.684307] Modules linked in: [ 947.684308] [ 947.684308] Pid: 2359, comm: bash Not tainted (2.6.26-rc6 #8) [ 947.684308] EIP: 0060:[] EFLAGS: 00000246 CPU: 0 [ 947.684308] EIP is at __rt_schedulable+0x12/0x160 [ 947.684308] EAX: 00000000 EBX: 00000000 ECX: 00000000 EDX: 00000001 [ 947.684308] ESI: c0521db4 EDI: 00000001 EBP: c6cc9f00 ESP: c6cc9ed0 [ 947.684308] DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068 [ 947.684308] Process bash (pid: 2359, tiÆcc8000 taskÇa54f00=20 task.tiÆcc8000) [ 947.684308] Stack: c0222790 00000000 080f8c08 c0521db4 c6cc9f00 00000001 00000000 00000000 [ 947.684308] c6cc9f9c 00000000 c0521db4 00000001 c6cc9f28 c0216d40 00000000 00000000 [ 947.684308] c6cc9f9c 000f4240 000e7ef0 ffffffff c0521db4 c79dfb60 c6cc9f58 c02af2cc [ 947.684308] Call Trace: [ 947.684308] [] ? do_proc_dointvec_conv+0x0/0x50 [ 947.684308] [] ? sched_rt_handler+0x80/0x110 [ 947.684308] [] ? proc_sys_call_handler+0x9c/0xb0 [ 947.684308] [] ? proc_sys_write+0x1a/0x20 [ 947.684308] [] ? vfs_write+0x96/0x160 [ 947.684308] [] ? proc_sys_write+0x0/0x20 [ 947.684308] [] ? sys_write+0x3d/0x70 [ 947.684308] [] ? 
sysenter_past_esp+0x6a/0x91 [ 947.684308] ======================= [ 947.684308] Code: 24 04 e8 62 b1 0e 00 89 c7 89 f8 8b 5d f4 8b 75 f8 8b 7d fc 89 ec 5d c3 90 55 89 e5 57 56 53 83 ec 24 89 45 ec 89 55 e4 89 4d e8 <8b> b8 8c 00 00 00 85 ff 0f 84 c9 00 00 00 8b 57 24 39 55 e8 8b [ 947.684308] EIP: [] __rt_schedulable+0x12/0x160 SS:ESP 0068:c6cc9ed0 We think the following patch solves the issue. Signed-off-by: Dario Faggioli Signed-off-by: Michael Trimarchi Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 04228524d16..320e9a43d4c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8350,7 +8350,7 @@ static unsigned long to_ratio(u64 period, u64 runtime) #ifdef CONFIG_CGROUP_SCHED static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { - struct task_group *tgi, *parent = tg->parent; + struct task_group *tgi, *parent = tg ? tg->parent : NULL; unsigned long total = 0; if (!parent) { -- cgit v1.2.3 From 7ea56616ba6b3d67a4892728182e38ae162ea3e7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 09:06:56 +0200 Subject: sched: rt-group: fix hierarchy Don't re-set the entity's runqueue to the wrong rq after we've set it to the right one. Signed-off-by: Peter Zijlstra Tested-by: Daniel K. Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 320e9a43d4c..ce375cb551e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7628,7 +7628,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, else rt_se->rt_rq = parent->my_q; - rt_se->rt_rq = &rq->rt; rt_se->my_q = rt_rq; rt_se->parent = parent; INIT_LIST_HEAD(&rt_se->run_list); -- cgit v1.2.3 From ad2a3f13b7258a5daaaeb8cff9f835aac468b71d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 09:06:57 +0200 Subject: sched: rt-group: heirarchy aware throttle The bandwidth throttle code dequeues a group when it runs out of quota, and re-queues it once the period rolls over and the quota gets refreshed. Sadly it failed to take the hierarchy into consideration. Share more of the enqueue/dequeue code with regular task opterations. Also, some operations like sched_setscheduler() can dequeue/enqueue tasks that are in throttled runqueues, we should not inadvertly re-enqueue empty runqueues so check for that. Signed-off-by: Peter Zijlstra Tested-by: Daniel K. Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 59 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 3432d573205..837241568d7 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -449,13 +449,19 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) #endif } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se) +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; struct rt_rq *group_rq = group_rt_rq(rt_se); - if (group_rq && rt_rq_throttled(group_rq)) + /* + * Don't enqueue the group if its throttled, or when empty. + * The latter is a consequence of the former when a child group + * get throttled and the current group doesn't have any other + * active members. 
+ */ + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) return; list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); @@ -464,7 +470,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se) inc_rt_tasks(rt_se, rt_rq); } -static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; @@ -480,11 +486,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) * Because the prio of an upper entry depends on the lower * entries, we must remove entries top - down. */ -static void dequeue_rt_stack(struct task_struct *p) +static void dequeue_rt_stack(struct sched_rt_entity *rt_se) { - struct sched_rt_entity *rt_se, *back = NULL; + struct sched_rt_entity *back = NULL; - rt_se = &p->rt; for_each_sched_rt_entity(rt_se) { rt_se->back = back; back = rt_se; @@ -492,7 +497,26 @@ static void dequeue_rt_stack(struct task_struct *p) for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) - dequeue_rt_entity(rt_se); + __dequeue_rt_entity(rt_se); + } +} + +static void enqueue_rt_entity(struct sched_rt_entity *rt_se) +{ + dequeue_rt_stack(rt_se); + for_each_sched_rt_entity(rt_se) + __enqueue_rt_entity(rt_se); +} + +static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +{ + dequeue_rt_stack(rt_se); + + for_each_sched_rt_entity(rt_se) { + struct rt_rq *rt_rq = group_rt_rq(rt_se); + + if (rt_rq && rt_rq->rt_nr_running) + __enqueue_rt_entity(rt_se); } } @@ -506,32 +530,15 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) if (wakeup) rt_se->timeout = 0; - dequeue_rt_stack(p); - - /* - * enqueue everybody, bottom - up. - */ - for_each_sched_rt_entity(rt_se) - enqueue_rt_entity(rt_se); + enqueue_rt_entity(rt_se); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) { struct sched_rt_entity *rt_se = &p->rt; - struct rt_rq *rt_rq; update_curr_rt(rq); - - dequeue_rt_stack(p); - - /* - * re-enqueue all non-empty rt_rq entities. - */ - for_each_sched_rt_entity(rt_se) { - rt_rq = group_rt_rq(rt_se); - if (rt_rq && rt_rq->rt_nr_running) - enqueue_rt_entity(rt_se); - } + dequeue_rt_entity(rt_se); } /* -- cgit v1.2.3 From 15a8641eadb492ef7c5489faa25256967bdfd303 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 09:06:59 +0200 Subject: sched: rt-group: fix RR buglet In tick_task_rt() we first call update_curr_rt() which can dequeue a runqueue due to it running out of runtime, and then we try to requeue it, of it also having exhausted its RR quota. Obviously requeueing something that is no longer on the runqueue will not have the expected result. Signed-off-by: Peter Zijlstra Tested-by: Daniel K. 
Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 837241568d7..1dad5bbb59b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -549,8 +549,10 @@ static void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) { struct rt_prio_array *array = &rt_rq->active; + struct list_head *queue = array->queue + rt_se_prio(rt_se); - list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); + if (on_rt_rq(rt_se)) + list_move_tail(&rt_se->run_list, queue); } static void requeue_task_rt(struct rq *rq, struct task_struct *p) -- cgit v1.2.3 From f18f982abf183e91f435990d337164c7a43d1e6d Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 29 May 2008 11:17:01 -0700 Subject: sched: CPU hotplug events must not destroy scheduler domains created by the cpusets First issue is not related to the cpusets. We're simply leaking doms_cur. It's allocated in arch_init_sched_domains() which is called for every hotplug event. So we just keep reallocation doms_cur without freeing it. I introduced free_sched_domains() function that cleans things up. Second issue is that sched domains created by the cpusets are completely destroyed by the CPU hotplug events. For all CPU hotplug events scheduler attaches all CPUs to the NULL domain and then puts them all into the single domain thereby destroying domains created by the cpusets (partition_sched_domains). The solution is simple, when cpusets are enabled scheduler should not create default domain and instead let cpusets do that. Which is exactly what the patch does. Signed-off-by: Max Krasnyansky Cc: pj@sgi.com Cc: menage@google.com Cc: rostedt@goodmis.org Acked-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/cpuset.c | 6 ++++++ kernel/sched.c | 22 ++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 039baa4cd90..bceb8955797 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1890,6 +1890,12 @@ static void common_cpu_mem_hotplug_unplug(void) top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; scan_for_empty_cpusets(&top_cpuset); + /* + * Scheduler destroys domains on hotplug events. + * Rebuild them based on the current settings. + */ + rebuild_sched_domains(); + cgroup_unlock(); } diff --git a/kernel/sched.c b/kernel/sched.c index ce375cb551e..4a3cb061415 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7237,6 +7237,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void) { } +/* + * Free current domain masks. + * Called after all cpus are attached to NULL domain. + */ +static void free_sched_domains(void) +{ + ndoms_cur = 0; + if (doms_cur != &fallback_doms) + kfree(doms_cur); + doms_cur = &fallback_doms; +} + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. 
* For now this just excludes isolated cpus, but could be used to @@ -7384,6 +7396,7 @@ int arch_reinit_sched_domains(void) get_online_cpus(); mutex_lock(&sched_domains_mutex); detach_destroy_domains(&cpu_online_map); + free_sched_domains(); err = arch_init_sched_domains(&cpu_online_map); mutex_unlock(&sched_domains_mutex); put_online_cpus(); @@ -7469,6 +7482,7 @@ static int update_sched_domains(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: detach_destroy_domains(&cpu_online_map); + free_sched_domains(); return NOTIFY_OK; case CPU_UP_CANCELED: @@ -7487,8 +7501,16 @@ static int update_sched_domains(struct notifier_block *nfb, return NOTIFY_DONE; } +#ifndef CONFIG_CPUSETS + /* + * Create default domain partitioning if cpusets are disabled. + * Otherwise we let cpusets rebuild the domains based on the + * current setup. + */ + /* The hotplug lock is already held by cpu_up/cpu_down */ arch_init_sched_domains(&cpu_online_map); +#endif return NOTIFY_OK; } -- cgit v1.2.3 From 30e0e178193d4221abc9926b07a4c7661c7cc4a9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 13 May 2008 10:27:17 +0800 Subject: cpuset: limit the input of cpuset.sched_relax_domain_level We allow the inputs to be [-1 ... SD_LV_MAX), and return -EINVAL for inputs outside this range. Signed-off-by: Li Zefan Acked-by: Paul Menage Acked-by: Paul Jackson Acked-by: Hidetoshi Seto Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/cpuset.c | 4 ++-- kernel/sched.c | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 039baa4cd90..66103a119bf 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1037,8 +1037,8 @@ int current_cpuset_is_being_rebound(void) static int update_relax_domain_level(struct cpuset *cs, s64 val) { - if ((int)val < 0) - val = -1; + if (val < -1 || val >= SD_LV_MAX) + return -EINVAL; if (val != cs->relax_domain_level) { cs->relax_domain_level = val; diff --git a/kernel/sched.c b/kernel/sched.c index eaf6751e761..bb2c699c9f5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6877,7 +6877,12 @@ static int default_relax_domain_level = -1; static int __init setup_relax_domain_level(char *str) { - default_relax_domain_level = simple_strtoul(str, NULL, 0); + unsigned long val; + + val = simple_strtoul(str, NULL, 0); + if (val < SD_LV_MAX) + default_relax_domain_level = val; + return 1; } __setup("relax_domain_level=", setup_relax_domain_level); -- cgit v1.2.3 From afd38009cc3acd36d41f349a669ad5825d695b1f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 May 2008 14:18:17 -0400 Subject: rcupreempt: remove export of rcu_batches_completed_bh In rcupreempt, rcu_batches_completed_bh is defined as a static inline in the header file. This does not need to be exported, and not only that, this breaks my PPC build. Signed-off-by: Steven Rostedt Cc: "Paul E. 
McKenney" Cc: paulus@samba.org Cc: linuxppc-dev@ozlabs.org Signed-off-by: Thomas Gleixner --- kernel/rcupreempt.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index e1cdf196a51..5e02b774070 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -217,8 +217,6 @@ long rcu_batches_completed(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed); -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); - void __rcu_read_lock(void) { int idx; -- cgit v1.2.3 From 9c106c119ebedf624fbd682fd2a4d52e3c8c1a67 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Tue, 27 May 2008 12:23:29 -0500 Subject: softlockup: fix NMI hangs due to lock race - 2.6.26-rc regression The touch_nmi_watchdog() routine on x86 ultimately calls touch_softlockup_watchdog(). The problem is that to touch the softlockup watchdog, the cpu_clock code has to be called, which could involve multiple cpu locks and can lead to a hard hang if one of the locks is held by a processor that is not going to return anytime soon (such as could be the case with kgdb or perhaps even with some other kind of exception). This patch causes the public version of touch_softlockup_watchdog() to defer the cpu clock access to a later point. The test case for this problem is to use the following kernel config options: CONFIG_KGDB_TESTS=y CONFIG_KGDB_TESTS_ON_BOOT=y CONFIG_KGDB_TESTS_BOOT_STRING="V1F100I100000" It should be noted that the kgdb test suite and these options were not available until 2.6.26-rc2, so it was necessary to patch the kgdb test suite during the bisection. I would consider this patch a regression fix because the problem first appeared in commit 27ec4407790d075c325e1f4da0a19c56953cce23 when some logic was added to try to periodically sync the clocks. It was possible to work around this particular problem by simply not performing the sync anytime the system was in a critical context. This was ok until commit 3e51f33fcc7f55e6df25d15b55ed10c8b4da84cd, which added config option CONFIG_HAVE_UNSTABLE_SCHED_CLOCK and some multi-cpu locks to sync the clocks. It became clear that accessing this code from an NMI was the source of the lockups. Avoiding access to the low-level clock code from code inside the NMI processing also fixed the problem with the 27ec44... commit.
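The shape of the fix is a classic defer-out-of-NMI pattern. In the sketch below, touch_ts, touch_watchdog() and watchdog_tick() are illustrative names, while DEFINE_PER_CPU, __raw_get_cpu_var, cpu_clock() and raw_smp_processor_id() are the real 2.6.26 interfaces used by the patch; the NMI-safe entry point only stores a sentinel, and the lock-taking clock read happens later from normal context.

static DEFINE_PER_CPU(unsigned long, touch_ts);

/* May be called from NMI context: nothing but a per-CPU store. */
void touch_watchdog(void)
{
	__raw_get_cpu_var(touch_ts) = 0;	/* 0 means "resync the timestamp later" */
}

/* Runs in normal timer context, where taking the cpu_clock() locks is safe. */
static void watchdog_tick(void)
{
	if (__raw_get_cpu_var(touch_ts) == 0)
		__raw_get_cpu_var(touch_ts) =
			cpu_clock(raw_smp_processor_id()) >> 30;	/* roughly seconds */
	/* ... the usual stall check against the timestamp follows ... */
}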
Signed-off-by: Jason Wessel Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 01b6522fd92..c828c2339cc 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -49,12 +49,17 @@ static unsigned long get_timestamp(int this_cpu) return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ } -void touch_softlockup_watchdog(void) +static void __touch_softlockup_watchdog(void) { int this_cpu = raw_smp_processor_id(); __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu); } + +void touch_softlockup_watchdog(void) +{ + __raw_get_cpu_var(touch_timestamp) = 0; +} EXPORT_SYMBOL(touch_softlockup_watchdog); void touch_all_softlockup_watchdogs(void) @@ -80,7 +85,7 @@ void softlockup_tick(void) unsigned long now; if (touch_timestamp == 0) { - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); return; } @@ -95,7 +100,7 @@ void softlockup_tick(void) /* do not print during early bootup: */ if (unlikely(system_state != SYSTEM_RUNNING)) { - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); return; } @@ -214,7 +219,7 @@ static int watchdog(void *__bind_cpu) sched_setscheduler(current, SCHED_FIFO, ¶m); /* initialize timestamp */ - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); set_current_state(TASK_INTERRUPTIBLE); /* @@ -223,7 +228,7 @@ static int watchdog(void *__bind_cpu) * debug-printout triggers in softlockup_tick(). */ while (!kthread_should_stop()) { - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); schedule(); if (kthread_should_stop()) -- cgit v1.2.3 From 31a72bce0bd6f3e0114009288bccbc96376eeeca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Jun 2008 09:26:49 -0700 Subject: rcu: make rcutorture more vicious: reinstate boot-time testing This patch re-institutes the ability to build rcutorture directly into the Linux kernel. The reason that this capability was removed was that this could result in your kernel being pretty much useless, as rcutorture would be running starting from early boot. This problem has been avoided by (1) making rcutorture run only three seconds of every six by default, (2) adding a CONFIG_RCU_TORTURE_TEST_RUNNABLE that permits rcutorture to be quiesced at boot time, and (3) adding a sysctl in /proc named /proc/sys/kernel/rcutorture_runnable that permits rcutorture to be quiesced and unquiesced when built into the kernel. Please note that this /proc file is -not- available when rcutorture is built as a module. Please also note that to get the earlier take-no-prisoners behavior, you must use the boot command line to set rcutorture's "stutter" parameter to zero. The rcutorture quiescing mechanism is currently quite crude: loops in each rcutorture process that poll a global variable once per tick. Suggestions for improvement are welcome. The default action will be to reduce the polling rate to a few times per second. Signed-off-by: Paul E. 
McKenney Suggested-by: Ingo Molnar Signed-off-by: Ingo Molnar --- kernel/rcutorture.c | 9 ++++++++- kernel/sysctl.c | 13 +++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 98ae7d16822..27003e2421c 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -125,6 +125,13 @@ static struct list_head rcu_torture_removed; static int stutter_pause_test = 0; +#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) +#define RCUTORTURE_RUNNABLE_INIT 1 +#else +#define RCUTORTURE_RUNNABLE_INIT 0 +#endif +int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; + /* * Allocate an element from the rcu_tortures pool. */ @@ -188,7 +195,7 @@ rcu_random(struct rcu_random_state *rrsp) static void rcu_stutter_wait(void) { - while (stutter_pause_test) + while (stutter_pause_test || !rcutorture_runnable) schedule_timeout_interruptible(1); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 29116652dca..c6887cf135c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -82,6 +82,9 @@ extern int maps_protect; extern int sysctl_stat_interval; extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; +#ifdef CONFIG_RCU_TORTURE_TEST +extern int rcutorture_runnable; +#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ /* Constants used for minimum and maximum */ #if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM) @@ -813,6 +816,16 @@ static struct ctl_table kern_table[] = { .child = key_sysctls, }, #endif +#ifdef CONFIG_RCU_TORTURE_TEST + { + .ctl_name = CTL_UNNUMBERED, + .procname = "rcutorture_runnable", + .data = &rcutorture_runnable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt -- cgit v1.2.3 From d4abc238c9f4df8b3216f3e883f5d0a07b7ac75a Mon Sep 17 00:00:00 2001 From: Bharath Ravi Date: Mon, 16 Jun 2008 15:11:01 +0530 Subject: sched, delay accounting: fix incorrect delay time when constantly waiting on runqueue This patch corrects the incorrect value of per process run-queue wait time reported by delay statistics. The anomaly was due to the following reason. When a process leaves the CPU and immediately starts waiting for CPU on the runqueue (which means it remains in the TASK_RUNNABLE state), the time of re-entry into the run-queue is never recorded. Due to this, the waiting time on the runqueue from this point of re-entry upto the next time it hits the CPU is not accounted for. This is solved by recording the time of re-entry of a process leaving the CPU in the sched_info_depart() function IF the process will go back to waiting on the run-queue. This IF condition is verified by checking whether the process is still in the TASK_RUNNABLE state. The patch was tested on 2.6.26-rc6 using two simple CPU hog programs. The values noted prior to the fix did not account for the time spent on the runqueue waiting. After the fix, the correct values were reported back to user space. 
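The bookkeeping change is small enough to model in a few lines. The sketch below is a user-space illustration only, with a hypothetical sched_info structure and a fake tick counter standing in for the scheduler clock; the point is that a task departing the CPU while still runnable re-records its queue timestamp, so the following wait is charged to run_delay.

#include <stdio.h>

/* Hypothetical stand-ins for the delay-accounting fields and clock. */
struct sched_info { unsigned long last_queued, run_delay; };
static unsigned long now;			/* fake clock, in ticks */

static void info_queued(struct sched_info *si)
{
	si->last_queued = now;
}

static void info_arrive(struct sched_info *si)	/* task gets the CPU */
{
	if (si->last_queued) {
		si->run_delay += now - si->last_queued;
		si->last_queued = 0;
	}
}

/* Task leaves the CPU; 'runnable' mirrors t->state == TASK_RUNNING. */
static void info_depart(struct sched_info *si, int runnable)
{
	if (runnable)		/* the fix: start timing the wait again */
		info_queued(si);
}

int main(void)
{
	struct sched_info si = { 0, 0 };

	now = 10; info_depart(&si, 1);	/* preempted while still runnable */
	now = 25; info_arrive(&si);	/* waited 15 ticks on the runqueue */
	printf("run_delay = %lu\n", si.run_delay);	/* prints 15 */
	return 0;
}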
Signed-off-by: Bharath Ravi Signed-off-by: Madhava K R Cc: dhaval@linux.vnet.ibm.com Cc: vatsa@in.ibm.com Cc: balbir@in.ibm.com Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_stats.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index a38878e0e49..80179ef7450 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -198,6 +198,9 @@ static inline void sched_info_queued(struct task_struct *t) /* * Called when a process ceases being the active-running process, either * voluntarily or involuntarily. Now we can calculate how long we ran. + * Also, if the process is still in the TASK_RUNNING state, call + * sched_info_queued() to mark that it has now again started waiting on + * the runqueue. */ static inline void sched_info_depart(struct task_struct *t) { @@ -206,6 +209,9 @@ static inline void sched_info_depart(struct task_struct *t) t->sched_info.cpu_time += delta; rq_sched_info_depart(task_rq(t), delta); + + if (t->state == TASK_RUNNING) + sched_info_queued(t); } /* -- cgit v1.2.3 From ada18de2eb76961a4d4847f63291744c9e7beec4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 14:22:24 +0200 Subject: sched: debug: add some rt debug output Signed-off-by: Peter Zijlstra Cc: "Daniel K." Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 40 +++++++++++++++++++++++++++++++++++++--- kernel/sched_rt.c | 14 ++++++++++++++ 2 files changed, 51 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 8bb713040ac..8e077b9c91c 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -119,9 +119,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) struct sched_entity *last; unsigned long flags; -#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED) - SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); -#else +#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) char path[128] = ""; struct cgroup *cgroup = NULL; struct task_group *tg = cfs_rq->tg; @@ -133,6 +131,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cgroup_path(cgroup, path, sizeof(path)); SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); +#else + SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); #endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", @@ -169,6 +169,39 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->nr_spread_over); } +void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) +{ +#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) + char path[128] = ""; + struct cgroup *cgroup = NULL; + struct task_group *tg = rt_rq->tg; + + if (tg) + cgroup = tg->css.cgroup; + + if (cgroup) + cgroup_path(cgroup, path, sizeof(path)); + + SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); +#else + SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); +#endif + + +#define P(x) \ + SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) +#define PN(x) \ + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) + + P(rt_nr_running); + P(rt_throttled); + PN(rt_time); + PN(rt_runtime); + +#undef PN +#undef P +} + static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = &per_cpu(runqueues, cpu); @@ -208,6 +241,7 @@ static void print_cpu(struct seq_file *m, int cpu) #undef PN print_cfs_stats(m, cpu); + print_rt_stats(m, cpu); print_rq(m, rq, cpu); } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index fee5fa7c72d..2e0ccdcf046 100644 --- a/kernel/sched_rt.c 
+++ b/kernel/sched_rt.c @@ -1444,3 +1444,17 @@ static const struct sched_class rt_sched_class = { .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, }; + +#ifdef CONFIG_SCHED_DEBUG +extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); + +static void print_rt_stats(struct seq_file *m, int cpu) +{ + struct rt_rq *rt_rq; + + rcu_read_lock(); + for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) + print_rt_rq(m, cpu, rt_rq); + rcu_read_unlock(); +} +#endif -- cgit v1.2.3 From b79f3833d81d54fc71d98c8064dc45f33a755a8a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 14:22:25 +0200 Subject: sched: rt: fix SMP bandwidth balancing for throttled groups Now we exceed the runtime and get throttled - the period rollover tick will subtract the cpu quota from the runtime and check if we're below quota. However with this cpu having a very small portion of the runtime it will not refresh as fast as it should. Therefore, also rebalance the runtime when we're throttled. Signed-off-by: Peter Zijlstra Cc: "Daniel K." Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2e0ccdcf046..87b2e3bf947 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -228,6 +228,28 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif +#ifdef CONFIG_SMP +static int do_balance_runtime(struct rt_rq *rt_rq); + +static int balance_runtime(struct rt_rq *rt_rq) +{ + int more = 0; + + if (rt_rq->rt_time > rt_rq->rt_runtime) { + spin_unlock(&rt_rq->rt_runtime_lock); + more = do_balance_runtime(rt_rq); + spin_lock(&rt_rq->rt_runtime_lock); + } + + return more; +} +#else +static inline int balance_runtime(struct rt_rq *rt_rq) +{ + return 0; +} +#endif + static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { int i, idle = 1; @@ -247,6 +269,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) u64 runtime; spin_lock(&rt_rq->rt_runtime_lock); + if (rt_rq->rt_throttled) + balance_runtime(rt_rq); runtime = rt_rq->rt_runtime; rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { @@ -267,7 +291,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) } #ifdef CONFIG_SMP -static int balance_runtime(struct rt_rq *rt_rq) +static int do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); struct root_domain *rd = cpu_rq(smp_processor_id())->rd; @@ -428,17 +452,10 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) return 0; -#ifdef CONFIG_SMP - if (rt_rq->rt_time > runtime) { - spin_unlock(&rt_rq->rt_runtime_lock); - balance_runtime(rt_rq); - spin_lock(&rt_rq->rt_runtime_lock); - - runtime = sched_rt_runtime(rt_rq); - if (runtime == RUNTIME_INF) - return 0; - } -#endif + balance_runtime(rt_rq); + runtime = sched_rt_runtime(rt_rq); + if (runtime == RUNTIME_INF) + return 0; if (rt_rq->rt_time > runtime) { rt_rq->rt_throttled = 1; -- cgit v1.2.3 From eff6549b957d15d1ad168d90b8c1eb643b9c163f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 14:22:26 +0200 Subject: sched: rt: move some code around Signed-off-by: Peter Zijlstra Cc: "Daniel K." 
Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 119 ++++++++++++++++++++++++++---------------------------- 1 file changed, 57 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 87b2e3bf947..61d52112289 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -228,68 +228,6 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif -#ifdef CONFIG_SMP -static int do_balance_runtime(struct rt_rq *rt_rq); - -static int balance_runtime(struct rt_rq *rt_rq) -{ - int more = 0; - - if (rt_rq->rt_time > rt_rq->rt_runtime) { - spin_unlock(&rt_rq->rt_runtime_lock); - more = do_balance_runtime(rt_rq); - spin_lock(&rt_rq->rt_runtime_lock); - } - - return more; -} -#else -static inline int balance_runtime(struct rt_rq *rt_rq) -{ - return 0; -} -#endif - -static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) -{ - int i, idle = 1; - cpumask_t span; - - if (rt_b->rt_runtime == RUNTIME_INF) - return 1; - - span = sched_rt_period_mask(); - for_each_cpu_mask(i, span) { - int enqueue = 0; - struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); - struct rq *rq = rq_of_rt_rq(rt_rq); - - spin_lock(&rq->lock); - if (rt_rq->rt_time) { - u64 runtime; - - spin_lock(&rt_rq->rt_runtime_lock); - if (rt_rq->rt_throttled) - balance_runtime(rt_rq); - runtime = rt_rq->rt_runtime; - rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); - if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { - rt_rq->rt_throttled = 0; - enqueue = 1; - } - if (rt_rq->rt_time || rt_rq->rt_nr_running) - idle = 0; - spin_unlock(&rt_rq->rt_runtime_lock); - } - - if (enqueue) - sched_rt_rq_enqueue(rt_rq); - spin_unlock(&rq->lock); - } - - return idle; -} - #ifdef CONFIG_SMP static int do_balance_runtime(struct rt_rq *rt_rq) { @@ -425,8 +363,65 @@ static void enable_runtime(struct rq *rq) spin_unlock_irqrestore(&rq->lock, flags); } +static int balance_runtime(struct rt_rq *rt_rq) +{ + int more = 0; + + if (rt_rq->rt_time > rt_rq->rt_runtime) { + spin_unlock(&rt_rq->rt_runtime_lock); + more = do_balance_runtime(rt_rq); + spin_lock(&rt_rq->rt_runtime_lock); + } + + return more; +} +#else +static inline int balance_runtime(struct rt_rq *rt_rq) +{ + return 0; +} #endif +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) +{ + int i, idle = 1; + cpumask_t span; + + if (rt_b->rt_runtime == RUNTIME_INF) + return 1; + + span = sched_rt_period_mask(); + for_each_cpu_mask(i, span) { + int enqueue = 0; + struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); + struct rq *rq = rq_of_rt_rq(rt_rq); + + spin_lock(&rq->lock); + if (rt_rq->rt_time) { + u64 runtime; + + spin_lock(&rt_rq->rt_runtime_lock); + if (rt_rq->rt_throttled) + balance_runtime(rt_rq); + runtime = rt_rq->rt_runtime; + rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); + if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { + rt_rq->rt_throttled = 0; + enqueue = 1; + } + if (rt_rq->rt_time || rt_rq->rt_nr_running) + idle = 0; + spin_unlock(&rt_rq->rt_runtime_lock); + } + + if (enqueue) + sched_rt_rq_enqueue(rt_rq); + spin_unlock(&rq->lock); + } + + return idle; +} + static inline int rt_se_prio(struct sched_rt_entity *rt_se) { #ifdef CONFIG_RT_GROUP_SCHED -- cgit v1.2.3 From 10b612f440a22a294e87ec7e8f03f9eea3338628 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 14:22:27 +0200 Subject: sched: rt: fix the bandwidth contraint computations Signed-off-by: Peter Zijlstra Cc: "Daniel K." 
Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5b307da827e..1f711a58a2b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8419,7 +8419,7 @@ static unsigned long to_ratio(u64 period, u64 runtime) #ifdef CONFIG_CGROUP_SCHED static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { - struct task_group *tgi, *parent = tg ? tg->parent : NULL; + struct task_group *tgi, *parent = tg->parent; unsigned long total = 0; if (!parent) { @@ -8443,7 +8443,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) } rcu_read_unlock(); - return total + to_ratio(period, runtime) < + return total + to_ratio(period, runtime) <= to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), parent->rt_bandwidth.rt_runtime); } @@ -8560,10 +8560,15 @@ long sched_group_rt_period(struct task_group *tg) static int sched_rt_global_constraints(void) { + struct task_group *tg = &root_task_group; + u64 rt_runtime, rt_period; int ret = 0; + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); + rt_runtime = tg->rt_bandwidth.rt_runtime; + mutex_lock(&rt_constraints_mutex); - if (!__rt_schedulable(NULL, 1, 0)) + if (!__rt_schedulable(tg, rt_period, rt_runtime)) ret = -EINVAL; mutex_unlock(&rt_constraints_mutex); -- cgit v1.2.3 From 6c3df25511c2c51f2dd36cc52a8d22363d731793 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 14:22:28 +0200 Subject: sched: rt: dont stop the period timer when there are tasks wanting to run So if the group ever gets throttled, it will never wake up again. Signed-off-by: Peter Zijlstra Cc: "Daniel K." Cc: Peter Zijlstra Reported-by: "Daniel K." --- kernel/sched_rt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 61d52112289..bd90c8bb073 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -412,7 +412,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; spin_unlock(&rt_rq->rt_runtime_lock); - } + } else if (rt_rq->rt_nr_running) + idle = 0; if (enqueue) sched_rt_rq_enqueue(rt_rq); -- cgit v1.2.3 From 8a8cde163ea724baf74e7752a31a69d3121a240e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 14:22:28 +0200 Subject: sched: rt: dont stop the period timer when there are tasks wanting to run So if the group ever gets throttled, it will never wake up again. Reported-by: "Daniel K." Signed-off-by: Peter Zijlstra Tested-by: Daniel K. Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1dad5bbb59b..0f3c19197fa 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -250,7 +250,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; spin_unlock(&rt_rq->rt_runtime_lock); - } + } else if (rt_rq->rt_nr_running) + idle = 0; if (enqueue) sched_rt_rq_enqueue(rt_rq); -- cgit v1.2.3 From 443cd507ce7f78c6f8742b72736585c031d5a921 Mon Sep 17 00:00:00 2001 From: "Huang, Ying" Date: Fri, 20 Jun 2008 16:39:21 +0800 Subject: lockdep: add lock_class information to lock_chain and output it This patch records array of lock_class into lock_chain, and export lock_chain information via /proc/lockdep_chains. 
It is based on x86/master branch of git-x86 tree, and has been tested on x86_64 platform. Signed-off-by: Huang Ying Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 38 +++++++++++++++++-- kernel/lockdep_internals.h | 6 +++ kernel/lockdep_proc.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 81a4e4a3f08..a796f1f38ac 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1458,7 +1458,14 @@ out_bug: } unsigned long nr_lock_chains; -static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; +struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; +atomic_t nr_chain_hlocks; +static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; + +struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) +{ + return lock_classes + chain_hlocks[chain->base + i]; +} /* * Look up a dependency chain. If the key is not present yet then @@ -1466,10 +1473,15 @@ static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; * validated. If the key is already hashed, return 0. * (On return with 1 graph_lock is held.) */ -static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class) +static inline int lookup_chain_cache(struct task_struct *curr, + struct held_lock *hlock, + u64 chain_key) { + struct lock_class *class = hlock->class; struct list_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; + struct held_lock *hlock_curr, *hlock_next; + int i, j, n; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; @@ -1517,6 +1529,26 @@ cache_hit: } chain = lock_chains + nr_lock_chains++; chain->chain_key = chain_key; + chain->irq_context = hlock->irq_context; + /* Find the first held_lock of current chain */ + hlock_next = hlock; + for (i = curr->lockdep_depth - 1; i >= 0; i--) { + hlock_curr = curr->held_locks + i; + if (hlock_curr->irq_context != hlock_next->irq_context) + break; + hlock_next = hlock; + } + i++; + chain->depth = curr->lockdep_depth + 1 - i; + n = atomic_add_return(chain->depth, &nr_chain_hlocks); + if (unlikely(n < MAX_LOCKDEP_CHAIN_HLOCKS)) { + chain->base = n - chain->depth; + for (j = 0; j < chain->depth - 1; j++, i++) { + int lock_id = curr->held_locks[i].class - lock_classes; + chain_hlocks[chain->base + j] = lock_id; + } + chain_hlocks[chain->base + j] = class - lock_classes; + } list_add_tail_rcu(&chain->entry, hash_head); debug_atomic_inc(&chain_lookup_misses); inc_chains(); @@ -1538,7 +1570,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * graph_lock for us) */ if (!hlock->trylock && (hlock->check == 2) && - lookup_chain_cache(chain_key, hlock->class)) { + lookup_chain_cache(curr, hlock, chain_key)) { /* * Check whether last held lock: * diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 8ce09bc4613..db09b176dd3 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -23,6 +23,8 @@ #define MAX_LOCKDEP_CHAINS_BITS 14 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) +#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) + /* * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the hash_lock. 
@@ -30,15 +32,19 @@ #define MAX_STACK_TRACE_ENTRIES 262144UL extern struct list_head all_lock_classes; +extern struct lock_chain lock_chains[]; extern void get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4); extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); +struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); + extern unsigned long nr_lock_classes; extern unsigned long nr_list_entries; extern unsigned long nr_lock_chains; +extern atomic_t nr_chain_hlocks; extern unsigned long nr_stack_trace_entries; extern unsigned int nr_hardirq_chains; diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 688c5f1940b..14d052c8a83 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -178,6 +178,93 @@ static const struct file_operations proc_lockdep_operations = { .release = seq_release, }; +static void *lc_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct lock_chain *chain; + + (*pos)++; + + if (v == SEQ_START_TOKEN) + chain = m->private; + else { + chain = v; + + if (*pos < nr_lock_chains) + chain = lock_chains + *pos; + else + chain = NULL; + } + + return chain; +} + +static void *lc_start(struct seq_file *m, loff_t *pos) +{ + if (*pos == 0) + return SEQ_START_TOKEN; + + if (*pos < nr_lock_chains) + return lock_chains + *pos; + + return NULL; +} + +static void lc_stop(struct seq_file *m, void *v) +{ +} + +static int lc_show(struct seq_file *m, void *v) +{ + struct lock_chain *chain = v; + struct lock_class *class; + int i; + + if (v == SEQ_START_TOKEN) { + seq_printf(m, "all lock chains:\n"); + return 0; + } + + seq_printf(m, "irq_context: %d\n", chain->irq_context); + + for (i = 0; i < chain->depth; i++) { + class = lock_chain_get_class(chain, i); + seq_printf(m, "[%p] ", class->key); + print_name(m, class); + seq_puts(m, "\n"); + } + seq_puts(m, "\n"); + + return 0; +} + +static const struct seq_operations lockdep_chains_ops = { + .start = lc_start, + .next = lc_next, + .stop = lc_stop, + .show = lc_show, +}; + +static int lockdep_chains_open(struct inode *inode, struct file *file) +{ + int res = seq_open(file, &lockdep_chains_ops); + if (!res) { + struct seq_file *m = file->private_data; + + if (nr_lock_chains) + m->private = lock_chains; + else + m->private = NULL; + } + return res; +} + +static const struct file_operations proc_lockdep_chains_operations = { + .open = lockdep_chains_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static void lockdep_stats_debug_show(struct seq_file *m) { #ifdef CONFIG_DEBUG_LOCKDEP @@ -294,6 +381,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", nr_lock_chains, MAX_LOCKDEP_CHAINS); + seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n", + atomic_read(&nr_chain_hlocks), MAX_LOCKDEP_CHAIN_HLOCKS); #endif #ifdef CONFIG_TRACE_IRQFLAGS @@ -661,6 +750,8 @@ static const struct file_operations proc_lock_stat_operations = { static int __init lockdep_proc_init(void) { proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); + proc_create("lockdep_chains", S_IRUSR, NULL, + &proc_lockdep_chains_operations); proc_create("lockdep_stats", S_IRUSR, NULL, &proc_lockdep_stats_operations); -- cgit v1.2.3 From bb10ed0994927d433f6dbdf274fdb26cfcf516b7 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 19 Jun 2008 15:04:07 -0700 Subject: sched: fix wait_for_completion_timeout() spurious failure under heavy load It seems that 
the current implementaton of wait_for_completion_timeout() has a small problem under very high load for the common pattern: if (!wait_for_completion_timeout(&done, timeout)) /* handle failure */ because the implementation very roughly does (lots of code deleted to show the basic flow): static inline long __sched do_wait_for_common(struct completion *x, long timeout, int state) { if (x->done) return timeout; do { timeout = schedule_timeout(timeout); if (!timeout) return timeout; } while (!x->done); return timeout; } so if the system is very busy and x->done is not set when do_wait_for_common() is entered, it is possible that the first call to schedule_timeout() returns 0 because the task doing wait_for_completion doesn't get rescheduled for a long time, even if it is woken up early enough. In this case, wait_for_completion_timeout() returns 0 without even checking x->done again, and the code above falls into its failure case purely for scheduler reasons, even if the hardware event or whatever was being waited for happened early enough. It would make sense to add an extra test to do_wait_for() in the timeout case and return 1 if x->done is actually set. A quick audit (not exhaustive) of wait_for_completion_timeout() callers seems to indicate that no one actually cares about the return value in the success case -- they just test for 0 (timed out) versus non-zero (wait succeeded). Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 4a3cb061415..577f160131b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4405,6 +4405,16 @@ do_wait_for_common(struct completion *x, long timeout, int state) spin_unlock_irq(&x->wait.lock); timeout = schedule_timeout(timeout); spin_lock_irq(&x->wait.lock); + + /* + * If the completion has arrived meanwhile + * then return 1 jiffy time left: + */ + if (x->done && !timeout) { + timeout = 1; + break; + } + if (!timeout) { __remove_wait_queue(&x->wait, &wait); return timeout; -- cgit v1.2.3 From 0f476b6d91a1395bda6464e653ce66ea9bea7167 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 18 Jun 2008 09:29:37 +0200 Subject: softirq: remove irqs_disabled warning from local_bh_enable There's no need to use local_irq_save() over local_irq_disable() in the local_bh_enable code since it is a bug to call it with irqs disabled and do_softirq will enable irqs if there is any pending work. Consolidate the code from local_bh_enable and ..._ip to avoid having a disconnect between them in the warnings they trigger that is currently there. Also always trigger the warning on in_irq(), not just in the trace-irqflags case. 
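The shape of that consolidation is the main point: one static helper carries the shared logic and receives the caller's address, and the exported entry points become thin wrappers. The fragment below is a hedged user-space sketch of that structure only; the my_enable*/_enable_ip names are hypothetical and the body just prints instead of doing softirq bookkeeping.

#include <stdio.h>

/* All shared warnings and bookkeeping live in exactly one place. */
static void _enable_ip(unsigned long ip)
{
	printf("enabled from %#lx\n", ip);
}

void my_enable(void)
{
	_enable_ip((unsigned long)__builtin_return_address(0));
}

void my_enable_ip(unsigned long ip)
{
	_enable_ip(ip);
}

int main(void)
{
	my_enable();		/* caller address captured automatically */
	my_enable_ip(0x1234);	/* caller supplies the address */
	return 0;
}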
Signed-off-by: Johannes Berg Cc: Michael Buesch Cc: David Ellingsworth Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/softirq.c | 48 +++++++++++------------------------------------- 1 file changed, 11 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 86775340ef1..2cf2502b7c7 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -131,23 +131,17 @@ void _local_bh_enable(void) EXPORT_SYMBOL(_local_bh_enable); -void local_bh_enable(void) +static inline void _local_bh_enable_ip(unsigned long ip) { + WARN_ON_ONCE(in_irq() || irqs_disabled()); #ifdef CONFIG_TRACE_IRQFLAGS - unsigned long flags; - - WARN_ON_ONCE(in_irq()); -#endif - WARN_ON_ONCE(irqs_disabled()); - -#ifdef CONFIG_TRACE_IRQFLAGS - local_irq_save(flags); + local_irq_disable(); #endif /* * Are softirqs going to be turned on now: */ if (softirq_count() == SOFTIRQ_OFFSET) - trace_softirqs_on((unsigned long)__builtin_return_address(0)); + trace_softirqs_on(ip); /* * Keep preemption disabled until we are done with * softirq processing: @@ -159,40 +153,20 @@ void local_bh_enable(void) dec_preempt_count(); #ifdef CONFIG_TRACE_IRQFLAGS - local_irq_restore(flags); + local_irq_enable(); #endif preempt_check_resched(); } + +void local_bh_enable(void) +{ + _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); +} EXPORT_SYMBOL(local_bh_enable); void local_bh_enable_ip(unsigned long ip) { -#ifdef CONFIG_TRACE_IRQFLAGS - unsigned long flags; - - WARN_ON_ONCE(in_irq()); - - local_irq_save(flags); -#endif - /* - * Are softirqs going to be turned on now: - */ - if (softirq_count() == SOFTIRQ_OFFSET) - trace_softirqs_on(ip); - /* - * Keep preemption disabled until we are done with - * softirq processing: - */ - sub_preempt_count(SOFTIRQ_OFFSET - 1); - - if (unlikely(!in_interrupt() && local_softirq_pending())) - do_softirq(); - - dec_preempt_count(); -#ifdef CONFIG_TRACE_IRQFLAGS - local_irq_restore(flags); -#endif - preempt_check_resched(); + _local_bh_enable_ip(ip); } EXPORT_SYMBOL(local_bh_enable_ip); -- cgit v1.2.3 From ea71a546706dfdad72462624394e1e472c6bf34f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 20 Jun 2008 18:32:20 +0400 Subject: sched: refactor wait_for_completion_timeout() Simplify the code and fix the boundary condition of wait_for_completion_timeout(,0). We can kill the first __remove_wait_queue() as well. 
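A condensed user-space model of the refactored control flow, and of the return contract callers rely on (0 only on a real timeout, otherwise at least 1), is sketched below. The completion is faked with a plain flag, the wait queue and signal handling are omitted, and fake_schedule_timeout() deliberately simulates the heavy-load case where the waiter's time budget runs out even though the event arrived.

#include <stdio.h>

static int done;			/* stands in for x->done */

static long fake_schedule_timeout(long timeout)
{
	/* the waiter is not rescheduled until its budget is gone,
	 * but the event it waits for arrives in the meantime */
	done = 1;
	return 0;
}

static long wait_for_done(long timeout)
{
	if (!done) {
		do {
			timeout = fake_schedule_timeout(timeout);
		} while (!done && timeout);
		if (!done)
			return timeout;		/* genuine timeout: 0 */
	}
	done--;
	return timeout ?: 1;			/* success: never 0 */
}

int main(void)
{
	if (!wait_for_done(100))		/* the common caller pattern */
		printf("timed out\n");
	else
		printf("completed\n");		/* printed: done was set */
	return 0;
}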
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra --- kernel/sched.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 577f160131b..bebf9788f45 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4398,32 +4398,20 @@ do_wait_for_common(struct completion *x, long timeout, int state) signal_pending(current)) || (state == TASK_KILLABLE && fatal_signal_pending(current))) { - __remove_wait_queue(&x->wait, &wait); - return -ERESTARTSYS; + timeout = -ERESTARTSYS; + break; } __set_current_state(state); spin_unlock_irq(&x->wait.lock); timeout = schedule_timeout(timeout); spin_lock_irq(&x->wait.lock); - - /* - * If the completion has arrived meanwhile - * then return 1 jiffy time left: - */ - if (x->done && !timeout) { - timeout = 1; - break; - } - - if (!timeout) { - __remove_wait_queue(&x->wait, &wait); - return timeout; - } - } while (!x->done); + } while (!x->done && timeout); __remove_wait_queue(&x->wait, &wait); + if (!x->done) + return timeout; } x->done--; - return timeout; + return timeout ?: 1; } static long __sched -- cgit v1.2.3 From 1b7558e457ed0de61023cfc913d2c342c7c3d9f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 23 Jun 2008 11:21:58 +0200 Subject: futexes: fix fault handling in futex_lock_pi This patch addresses a very sporadic pi-futex related failure in highly threaded java apps on large SMP systems. David Holmes reported that the pi_state consistency check in lookup_pi_state triggered with his test application. This means that the kernel internal pi_state and the user space futex variable are out of sync. First we assumed that this is a user space data corruption, but deeper investigation revieled that the problem happend because the pi-futex code is not handling a fault in the futex_lock_pi path when the user space variable needs to be fixed up. The fault happens when a fork mapped the anon memory which contains the futex readonly for COW or the page got swapped out exactly between the unlock of the futex and the return of either the new futex owner or the task which was the expected owner but failed to acquire the kernel internal rtmutex. The current futex_lock_pi() code drops out with an inconsistent in case it faults and returns -EFAULT to user space. User space has no way to fixup that state. When we wrote this code we thought that we could not drop the hash bucket lock at this point to handle the fault. After analysing the code again it turned out to be wrong because there are only two tasks involved which might modify the pi_state and the user space variable: - the task which acquired the rtmutex - the pending owner of the pi_state which did not get the rtmutex Both tasks drop into the fixup_pi_state() function before returning to user space. The first task which acquired the hash bucket lock faults in the fixup of the user space variable, drops the spinlock and calls futex_handle_fault() to fault in the page. Now the second task could acquire the hash bucket lock and tries to fixup the user space variable as well. It either faults as well or it succeeds because the first task already faulted the page in. One caveat is to avoid a double fixup. After returning from the fault handling we reacquire the hash bucket lock and check whether the pi_state owner has been modified already. 
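The user-space value update at the heart of the fix is a compare-and-swap retry loop that preserves the owner-died bit while swapping in the new owner TID. The sketch below models just that loop in user space; GCC atomic builtins stand in for cmpxchg_futex_value_locked(), the constants are local illustrative definitions, and the fault-handling path is left out entirely.

#include <stdint.h>
#include <stdio.h>

#define OWNER_DIED	0x40000000u
#define TID_MASK	0x3fffffffu

static uint32_t futex_word = OWNER_DIED | 1234;	/* old owner: TID 1234 */

static void fixup_owner(uint32_t newtid)
{
	uint32_t uval = __atomic_load_n(&futex_word, __ATOMIC_RELAXED);
	uint32_t newval;

	do {
		/* keep the owner-died bit, swap in the new TID */
		newval = (uval & OWNER_DIED) | (newtid & TID_MASK);
	} while (!__atomic_compare_exchange_n(&futex_word, &uval, newval,
					      0, __ATOMIC_SEQ_CST,
					      __ATOMIC_RELAXED));
}

int main(void)
{
	fixup_owner(5678);
	/* owner-died bit preserved, TID replaced: prints 0x4000162e */
	printf("futex word: %#x\n", (unsigned)futex_word);
	return 0;
}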
Reported-by: David Holmes Signed-off-by: Thomas Gleixner Cc: Andrew Morton Cc: David Holmes Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Signed-off-by: Ingo Molnar kernel/futex.c | 93 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 20 deletions(-) --- kernel/futex.c | 93 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 449def8074f..7d1136e97c1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1096,21 +1096,64 @@ static void unqueue_me_pi(struct futex_q *q) * private futexes. */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner) + struct task_struct *newowner, + struct rw_semaphore *fshared) { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; + struct task_struct *oldowner = pi_state->owner; u32 uval, curval, newval; - int ret; + int ret, attempt = 0; /* Owner died? */ + if (!pi_state->owner) + newtid |= FUTEX_OWNER_DIED; + + /* + * We are here either because we stole the rtmutex from the + * pending owner or we are the pending owner which failed to + * get the rtmutex. We have to replace the pending owner TID + * in the user space variable. This must be atomic as we have + * to preserve the owner died bit here. + * + * Note: We write the user space value _before_ changing the + * pi_state because we can fault here. Imagine swapped out + * pages or a fork, which was running right before we acquired + * mmap_sem, that marked all the anonymous memory readonly for + * cow. + * + * Modifying pi_state _before_ the user space value would + * leave the pi_state in an inconsistent state when we fault + * here, because we need to drop the hash bucket lock to + * handle the fault. This might be observed in the PID check + * in lookup_pi_state. + */ +retry: + if (get_futex_value_locked(&uval, uaddr)) + goto handle_fault; + + while (1) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + + if (curval == -EFAULT) + goto handle_fault; + if (curval == uval) + break; + uval = curval; + } + + /* + * We fixed up user space. Now we need to fix the pi_state + * itself. + */ if (pi_state->owner != NULL) { spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); spin_unlock_irq(&pi_state->owner->pi_lock); - } else - newtid |= FUTEX_OWNER_DIED; + } pi_state->owner = newowner; @@ -1118,26 +1161,35 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &newowner->pi_state_list); spin_unlock_irq(&newowner->pi_lock); + return 0; /* - * We own it, so we have to replace the pending owner - * TID. This must be atomic as we have preserve the - * owner died bit here. + * To handle the page fault we need to drop the hash bucket + * lock here. That gives the other task (either the pending + * owner itself or the task which stole the rtmutex) the + * chance to try the fixup of the pi_state. So once we are + * back from handling the fault we need to check the pi_state + * after reacquiring the hash bucket lock and before trying to + * do another fixup. When the fixup has been done already we + * simply return. 
*/ - ret = get_futex_value_locked(&uval, uaddr); +handle_fault: + spin_unlock(q->lock_ptr); - while (!ret) { - newval = (uval & FUTEX_OWNER_DIED) | newtid; + ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + spin_lock(q->lock_ptr); - if (curval == -EFAULT) - ret = -EFAULT; - if (curval == uval) - break; - uval = curval; - } - return ret; + /* + * Check if someone else fixed it for us: + */ + if (pi_state->owner != oldowner) + return 0; + + if (ret) + return ret; + + goto retry; } /* @@ -1507,7 +1559,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * that case: */ if (q.pi_state->owner != curr) - ret = fixup_pi_state_owner(uaddr, &q, curr); + ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); } else { /* * Catch the rare case, where the lock was released @@ -1539,7 +1591,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, int res; owner = rt_mutex_owner(&q.pi_state->pi_mutex); - res = fixup_pi_state_owner(uaddr, &q, owner); + res = fixup_pi_state_owner(uaddr, &q, owner, + fshared); /* propagate -EFAULT, if the fixup failed */ if (res) -- cgit v1.2.3 From 395a59d0f8e86bb39cd700c3d185d30c670bb958 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 21 Jun 2008 23:47:27 +0530 Subject: ftrace: store mcount address in rec->ip Record the address of the mcount call-site. Currently all archs except sparc64 record the address of the instruction following the mcount call-site. Some general cleanups are entailed. Storing mcount addresses in rec->ip enables looking them up in the kprobe hash table later on to check if they're kprobe'd. Signed-off-by: Abhishek Sagar Cc: davem@davemloft.net Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0d5bcf69952..f1e9e5c74e6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -27,6 +27,8 @@ #include #include +#include + #include "trace.h" /* ftrace_enabled is a method to turn ftrace on or off */ @@ -329,7 +331,6 @@ ftrace_record_ip(unsigned long ip) } #define FTRACE_ADDR ((long)(ftrace_caller)) -#define MCOUNT_ADDR ((long)(mcount)) static int __ftrace_replace_code(struct dyn_ftrace *rec, -- cgit v1.2.3 From ecea656d1d5e912d2f3d332657ea4a6d8380f891 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 21 Jun 2008 23:47:53 +0530 Subject: ftrace: freeze kprobe'd records Let records identified as being kprobe'd be marked as "frozen". The trouble with records which have a kprobe installed on their mcount call-site is that they don't get updated. So if such a function which is currently being traced gets its tracing disabled due to a new filter rule (or because it was added to the notrace list) then it won't be updated and continue being traced. This patch allows scanning of all frozen records during tracing to check if they should be traced. 
Signed-off-by: Abhishek Sagar Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++- kernel/trace/trace.c | 3 +++ 2 files changed, 74 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f1e9e5c74e6..d1238163155 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -163,6 +163,8 @@ enum { }; static int ftrace_filtered; +static int tracing_on; +static int frozen_record_count; static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; @@ -195,6 +197,71 @@ static int ftrace_record_suspend; static struct dyn_ftrace *ftrace_free_records; + +#ifdef CONFIG_KPROBES +static inline void freeze_record(struct dyn_ftrace *rec) +{ + if (!(rec->flags & FTRACE_FL_FROZEN)) { + rec->flags |= FTRACE_FL_FROZEN; + frozen_record_count++; + } +} + +static inline void unfreeze_record(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_FROZEN) { + rec->flags &= ~FTRACE_FL_FROZEN; + frozen_record_count--; + } +} + +static inline int record_frozen(struct dyn_ftrace *rec) +{ + return rec->flags & FTRACE_FL_FROZEN; +} +#else +# define freeze_record(rec) ({ 0; }) +# define unfreeze_record(rec) ({ 0; }) +# define record_frozen(rec) ({ 0; }) +#endif /* CONFIG_KPROBES */ + +int skip_trace(unsigned long ip) +{ + unsigned long fl; + struct dyn_ftrace *rec; + struct hlist_node *t; + struct hlist_head *head; + + if (frozen_record_count == 0) + return 0; + + head = &ftrace_hash[hash_long(ip, FTRACE_HASHBITS)]; + hlist_for_each_entry_rcu(rec, t, head, node) { + if (rec->ip == ip) { + if (record_frozen(rec)) { + if (rec->flags & FTRACE_FL_FAILED) + return 1; + + if (!(rec->flags & FTRACE_FL_CONVERTED)) + return 1; + + if (!tracing_on || !ftrace_enabled) + return 1; + + if (ftrace_filtered) { + fl = rec->flags & (FTRACE_FL_FILTER | + FTRACE_FL_NOTRACE); + if (!fl || (fl & FTRACE_FL_NOTRACE)) + return 1; + } + } + break; + } + } + + return 0; +} + static inline int ftrace_ip_in_hash(unsigned long ip, unsigned long key) { @@ -489,8 +556,11 @@ static int __ftrace_modify_code(void *data) */ __ftrace_update_code(NULL); ftrace_replace_code(1); - } else if (*command & FTRACE_DISABLE_CALLS) + tracing_on = 1; + } else if (*command & FTRACE_DISABLE_CALLS) { ftrace_replace_code(0); + tracing_on = 0; + } if (*command & FTRACE_UPDATE_TRACE_FUNC) ftrace_update_ftrace_func(ftrace_trace_function); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6e9dae7eb41..9ade79369bf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -988,6 +988,9 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) if (unlikely(!tracer_enabled)) return; + if (skip_trace(ip)) + return; + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; -- cgit v1.2.3 From f22f9a89ce6857d377bf22dba4c1a8cd256c5136 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 21 Jun 2008 23:50:29 +0530 Subject: ftrace: avoid modifying kprobe'd records Avoid modifying the mcount call-site if there is a kprobe installed on it. These records are not marked as failed however. This allowed the filter rules on them to remain up-to-date. Whenever the kprobe on the corresponding record is removed, the record gets updated as normal. 
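The deferral is done with a small re-queueing trick, visible in the hunk that follows: records whose call-site is claimed by a probe are moved to a temporary list and then spliced back onto the head of their bucket, so they are retried on the next update pass. The user-space model below reproduces just that trick with plain singly linked lists instead of the kernel's hlists.

#include <stdio.h>

struct node { int ip; int blocked; struct node *next; };

static void push(struct node **list, struct node *n)
{
	n->next = *list;
	*list = n;
}

static struct node *pop(struct node **list)
{
	struct node *n = *list;

	if (n)
		*list = n->next;
	return n;
}

/* One update pass: convert what we can, defer what a probe blocks. */
static void update_pass(struct node **bucket)
{
	struct node *temp = NULL, *n, **pp = bucket;

	while ((n = *pp)) {
		if (n->blocked) {		/* probe on the call-site */
			*pp = n->next;		/* unlink from the bucket */
			push(&temp, n);
			continue;
		}
		printf("converted %d\n", n->ip);	/* patch the site */
		pp = &n->next;
	}
	while ((n = pop(&temp)))		/* splice back at the head */
		push(bucket, n);
}

int main(void)
{
	static struct node a = { 1, 0, NULL }, b = { 2, 1, NULL };
	struct node *bucket = NULL;

	push(&bucket, &a);
	push(&bucket, &b);
	update_pass(&bucket);	/* converts 1; 2 stays queued for later */
	printf("head of bucket: %d\n", bucket->ip);	/* prints 2 */
	return 0;
}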
Signed-off-by: Abhishek Sagar Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d1238163155..85e84133541 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -500,6 +501,10 @@ static void ftrace_replace_code(int enable) if (rec->flags & FTRACE_FL_FAILED) continue; + /* ignore updates to this record's mcount site */ + if (get_kprobe((void *)rec->ip)) + continue; + failed = __ftrace_replace_code(rec, old, new, enable); if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { rec->flags |= FTRACE_FL_FAILED; @@ -692,11 +697,11 @@ unsigned long ftrace_update_tot_cnt; static int __ftrace_update_code(void *ignore) { + int i, save_ftrace_enabled; + cycle_t start, stop; struct dyn_ftrace *p; struct hlist_node *t, *n; - int save_ftrace_enabled; - cycle_t start, stop; - int i; + struct hlist_head *head, temp_list; /* Don't be recording funcs now */ ftrace_record_suspend++; @@ -708,8 +713,11 @@ static int __ftrace_update_code(void *ignore) /* No locks needed, the machine is stopped! */ for (i = 0; i < FTRACE_HASHSIZE; i++) { + INIT_HLIST_HEAD(&temp_list); + head = &ftrace_hash[i]; + /* all CPUS are stopped, we are safe to modify code */ - hlist_for_each_entry_safe(p, t, n, &ftrace_hash[i], node) { + hlist_for_each_entry_safe(p, t, n, head, node) { /* Skip over failed records which have not been * freed. */ if (p->flags & FTRACE_FL_FAILED) @@ -723,6 +731,19 @@ static int __ftrace_update_code(void *ignore) if (p->flags & (FTRACE_FL_CONVERTED)) break; + /* Ignore updates to this record's mcount site. + * Reintroduce this record at the head of this + * bucket to attempt to "convert" it again if + * the kprobe on it is unregistered before the + * next run. */ + if (get_kprobe((void *)p->ip)) { + ftrace_del_hash(p); + INIT_HLIST_NODE(&p->node); + hlist_add_head(&p->node, &temp_list); + continue; + } + + /* convert record (i.e, patch mcount-call with NOP) */ if (ftrace_code_disable(p)) { p->flags |= FTRACE_FL_CONVERTED; ftrace_update_cnt++; @@ -734,6 +755,12 @@ static int __ftrace_update_code(void *ignore) } } } + + hlist_for_each_entry_safe(p, t, n, &temp_list, node) { + hlist_del(&p->node); + INIT_HLIST_NODE(&p->node); + hlist_add_head(&p->node, head); + } } stop = ftrace_now(raw_smp_processor_id()); -- cgit v1.2.3 From 961ccddd59d627b89bd3dc284b6517833bbdf25d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 23 Jun 2008 13:55:38 +1000 Subject: sched: add new API sched_setscheduler_nocheck: add a flag to control access checks Hidehiro Kawai noticed that sched_setscheduler() can fail in stop_machine: it calls sched_setscheduler() from insmod, which can have CAP_SYS_MODULE without CAP_SYS_NICE. 
Two cases could have failed, so are changed to sched_setscheduler_nocheck: kernel/softirq.c:cpu_callback() - CPU hotplug callback kernel/stop_machine.c:__stop_machine_run() - Called from various places, including modprobe() Signed-off-by: Rusty Russell Cc: Jeremy Fitzhardinge Cc: Hidehiro Kawai Cc: Andrew Morton Cc: linux-mm@kvack.org Cc: sugita Cc: Satoshi OSHIMA Signed-off-by: Ingo Molnar --- kernel/sched.c | 48 ++++++++++++++++++++++++++++++++++++------------ kernel/softirq.c | 2 +- kernel/stop_machine.c | 2 +- 3 files changed, 38 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b048ad8a11a..8d7c246ab86 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4746,16 +4746,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) set_load_weight(p); } -/** - * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * NOTE that the task may be already dead. - */ -int sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param) +static int __sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param, bool user) { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; @@ -4787,7 +4779,7 @@ recheck: /* * Allow unprivileged RT tasks to decrease priority: */ - if (!capable(CAP_SYS_NICE)) { + if (user && !capable(CAP_SYS_NICE)) { if (rt_policy(policy)) { unsigned long rlim_rtprio; @@ -4823,7 +4815,8 @@ recheck: * Do not allow realtime tasks into groups that have no runtime * assigned. */ - if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) + if (user + && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@ -4872,8 +4865,39 @@ recheck: return 0; } + +/** + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * NOTE that the task may be already dead. + */ +int sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param) +{ + return __sched_setscheduler(p, policy, param, true); +} EXPORT_SYMBOL_GPL(sched_setscheduler); +/** + * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Just like sched_setscheduler, only don't bother checking if the + * current context has permission. For example, this is needed in + * stop_machine(): we create temporary high priority worker threads, + * but our caller might not have that capability. 
+ */ +int sched_setscheduler_nocheck(struct task_struct *p, int policy, + struct sched_param *param) +{ + return __sched_setscheduler(p, policy, param, false); +} + static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { diff --git a/kernel/softirq.c b/kernel/softirq.c index 36e06174004..afd9120c2fc 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -645,7 +645,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, p = per_cpu(ksoftirqd, hotcpu); per_cpu(ksoftirqd, hotcpu) = NULL; - sched_setscheduler(p, SCHED_FIFO, ¶m); + sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); kthread_stop(p); takeover_tasklets(hotcpu); break; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b7350bbfb07..ba9b2054ecb 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -187,7 +187,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; /* One high-prio thread per cpu. We'll do this one. */ - sched_setscheduler(p, SCHED_FIFO, ¶m); + sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); kthread_bind(p, cpu); wake_up_process(p); wait_for_completion(&smdata.done); -- cgit v1.2.3 From cd1a28e8457e6ebf72c48d84d4e736307e86436e Mon Sep 17 00:00:00 2001 From: "Huang, Ying" Date: Mon, 23 Jun 2008 11:20:54 +0800 Subject: lockdep: add lock_class information to lock_chain and output it It is based on x86/master branch of git-x86 tree, and has been tested on x86_64 platform. ChangeLog: v2: - Enclosing proc file system related code into CONFIG_PROVE_LOCKING. - Fix nr_chain_hlocks update code. Signed-off-by: Huang Ying Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 16 +++++++++++----- kernel/lockdep_internals.h | 2 +- kernel/lockdep_proc.c | 6 +++++- 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index a796f1f38ac..7553a28b99c 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1459,7 +1459,7 @@ out_bug: unsigned long nr_lock_chains; struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; -atomic_t nr_chain_hlocks; +int nr_chain_hlocks; static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) @@ -1481,7 +1481,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, struct list_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; struct held_lock *hlock_curr, *hlock_next; - int i, j, n; + int i, j, n, cn; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; @@ -1540,9 +1540,15 @@ cache_hit: } i++; chain->depth = curr->lockdep_depth + 1 - i; - n = atomic_add_return(chain->depth, &nr_chain_hlocks); - if (unlikely(n < MAX_LOCKDEP_CHAIN_HLOCKS)) { - chain->base = n - chain->depth; + cn = nr_chain_hlocks; + while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) { + n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth); + if (n == cn) + break; + cn = n; + } + if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { + chain->base = cn; for (j = 0; j < chain->depth - 1; j++, i++) { int lock_id = curr->held_locks[i].class - lock_classes; chain_hlocks[chain->base + j] = lock_id; diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index db09b176dd3..c3600a091a2 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -44,7 +44,7 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); extern unsigned long nr_lock_classes; extern unsigned long 
nr_list_entries; extern unsigned long nr_lock_chains; -extern atomic_t nr_chain_hlocks; +extern int nr_chain_hlocks; extern unsigned long nr_stack_trace_entries; extern unsigned int nr_hardirq_chains; diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 14d052c8a83..9b0e940e254 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -178,6 +178,7 @@ static const struct file_operations proc_lockdep_operations = { .release = seq_release, }; +#ifdef CONFIG_PROVE_LOCKING static void *lc_next(struct seq_file *m, void *v, loff_t *pos) { struct lock_chain *chain; @@ -264,6 +265,7 @@ static const struct file_operations proc_lockdep_chains_operations = { .llseek = seq_lseek, .release = seq_release, }; +#endif /* CONFIG_PROVE_LOCKING */ static void lockdep_stats_debug_show(struct seq_file *m) { @@ -382,7 +384,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) seq_printf(m, " dependency chains: %11lu [max: %lu]\n", nr_lock_chains, MAX_LOCKDEP_CHAINS); seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n", - atomic_read(&nr_chain_hlocks), MAX_LOCKDEP_CHAIN_HLOCKS); + nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS); #endif #ifdef CONFIG_TRACE_IRQFLAGS @@ -750,8 +752,10 @@ static const struct file_operations proc_lock_stat_operations = { static int __init lockdep_proc_init(void) { proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); +#ifdef CONFIG_PROVE_LOCKING proc_create("lockdep_chains", S_IRUSR, NULL, &proc_lockdep_chains_operations); +#endif proc_create("lockdep_stats", S_IRUSR, NULL, &proc_lockdep_stats_operations); -- cgit v1.2.3 From e3d7be270c5b1be07ffadcc8b56599ad8e975c1d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Jun 2008 13:06:38 -0700 Subject: rcu, rcutorture: make quiescent rcutorture less power-hungry Signed-off-by: Paul E. McKenney Cc: josh@freedesktop.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: dino@in.ibm.com Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Cc: vegard.nossum@gmail.com Cc: adobriyan@gmail.com Cc: oleg@tv-sign.ru Cc: bunk@kernel.org Cc: rjw@sisk.pl Signed-off-by: Ingo Molnar --- kernel/rcutorture.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 27003e2421c..0c40c13c540 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -196,7 +196,10 @@ static void rcu_stutter_wait(void) { while (stutter_pause_test || !rcutorture_runnable) - schedule_timeout_interruptible(1); + if (rcutorture_runnable) + schedule_timeout_interruptible(1); + else + schedule_timeout_interruptible(HZ); } /* -- cgit v1.2.3 From 3ccf79f4570acacfefc51772e8f9207895b35ad7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Jun 2008 14:02:55 -0700 Subject: rcu: make quiescent rcutorture less power-hungry This patch aligns the rcutorture wakeup times to align with all other multiple-of-a-second wakeups to further decrease power consumption. Suggested-by: Arjan van de Ven Signed-off-by: Paul E. 
McKenney Cc: josh@freedesktop.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: dino@in.ibm.com Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Cc: vegard.nossum@gmail.com Cc: adobriyan@gmail.com Cc: oleg@tv-sign.ru Cc: bunk@kernel.org Cc: rjw@sisk.pl Signed-off-by: Ingo Molnar --- kernel/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 0c40c13c540..5e954edf0ed 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -199,7 +199,7 @@ rcu_stutter_wait(void) if (rcutorture_runnable) schedule_timeout_interruptible(1); else - schedule_timeout_interruptible(HZ); + schedule_timeout_interruptible(round_jiffies_relative(HZ)); } /* -- cgit v1.2.3 From aabdc3b8c3b3d081f1532454e344208338478e29 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Tue, 24 Jun 2008 10:52:55 -0500 Subject: kgdb: sparse fix - Fix warning reported by sparse kernel/kgdb.c:1502:6: warning: symbol 'kgdb_console_write' was not declared. Should it be static? Signed-off-by: Jason Wessel --- kernel/kgdb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 79e3c90113c..3ec23c3ec97 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -1499,7 +1499,8 @@ int kgdb_nmicallback(int cpu, void *regs) return 1; } -void kgdb_console_write(struct console *co, const char *s, unsigned count) +static void kgdb_console_write(struct console *co, const char *s, + unsigned count) { unsigned long flags; -- cgit v1.2.3 From 13d5ef97f0675d789f559cfebc1df9d5e2b1879c Mon Sep 17 00:00:00 2001 From: Peng Haitao Date: Fri, 16 May 2008 10:15:04 +0800 Subject: [PATCH] kernel/audit.c: nlh->nlmsg_type is gotten more than once The first argument "nlh->nlmsg_type" of audit_receive_filter() should be modified to "msg_type" in audit_receive_msg(). 
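The fix is essentially about reading the request type out of the header once and using that local copy consistently, both for dispatch and for anything passed further down. A tiny user-space illustration of the pattern follows; the header structure and the type values are made up for the example and are not the real AUDIT_* constants.

#include <stdio.h>

struct nl_hdr { int nlmsg_type; };	/* hypothetical header */

static void receive_filter(int type)
{
	printf("filter request, type %d\n", type);
}

static void receive_msg(struct nl_hdr *nlh)
{
	int msg_type = nlh->nlmsg_type;	/* read the type exactly once */

	switch (msg_type) {
	case 1:				/* add-rule style request */
		/* fall through */
	case 2:				/* list-rules style request */
		receive_filter(msg_type);	/* pass the local copy down */
		break;
	default:
		printf("unhandled type %d\n", msg_type);
	}
}

int main(void)
{
	struct nl_hdr h = { 1 };

	receive_msg(&h);
	return 0;
}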
Signed-off-by: Peng Haitao Signed-off-by: Al Viro --- kernel/audit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e8692a5748c..56f30287e24 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -779,7 +779,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* fallthrough */ case AUDIT_LIST: - err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, + err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, uid, seq, data, nlmsg_len(nlh), loginuid, sessionid, sid); break; @@ -798,7 +798,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* fallthrough */ case AUDIT_LIST_RULES: - err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, + err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, uid, seq, data, nlmsg_len(nlh), loginuid, sessionid, sid); break; -- cgit v1.2.3 From 9f0aecdd1cd6aacee9aa8f08031f4f2e09e454dc Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 19 May 2008 15:09:21 -0700 Subject: [PATCH] audit: fix kernel-doc parameter notation Fix auditfilter kernel-doc misssing parameter description: Warning(lin2626-rc3//kernel/auditfilter.c:1551): No description found for parameter 'sessionid' Signed-off-by: Randy Dunlap Signed-off-by: Al Viro --- kernel/auditfilter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 0e0bd27e651..75cdf262851 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1544,6 +1544,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, * @data: payload data * @datasz: size of payload data * @loginuid: loginuid of sender + * @sessionid: sessionid for netlink audit message * @sid: SE Linux Security ID of sender */ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, -- cgit v1.2.3 From d8de72473effd674a3c1fe9621821f406f5587c9 Mon Sep 17 00:00:00 2001 From: Peng Haitao Date: Tue, 20 May 2008 09:13:02 +0800 Subject: [PATCH] remove useless argument type in audit_filter_user() The second argument "type" is not used in audit_filter_user(), so I think that type can be removed. If I'm wrong, please tell me. Signed-off-by: Peng Haitao Signed-off-by: Al Viro --- kernel/audit.c | 2 +- kernel/auditfilter.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 56f30287e24..e092f1c0ce3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -738,7 +738,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; - err = audit_filter_user(&NETLINK_CB(skb), msg_type); + err = audit_filter_user(&NETLINK_CB(skb)); if (err == 1) { err = 0; if (msg_type == AUDIT_USER_TTY) { diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 75cdf262851..98c50cc671b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1721,7 +1721,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, return 1; } -int audit_filter_user(struct netlink_skb_parms *cb, int type) +int audit_filter_user(struct netlink_skb_parms *cb) { enum audit_state state = AUDIT_DISABLED; struct audit_entry *e; -- cgit v1.2.3 From 0729fbf3bc70870370b4f43d652f05a468dc68b8 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 25 Jun 2008 12:24:52 -0700 Subject: rcu: make rcutorture even more vicious: invoke RCU readers from irq handlers (timers) This patch allows torturing RCU from irq handlers (timers, in this case). A new module parameter irqreader enables such additional torturing, and is enabled by default. Variants of RCU that do not tolerate readers being called from irq handlers (e.g., SRCU) ignore irqreader. Signed-off-by: Paul E. McKenney Cc: josh@freedesktop.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: dino@in.ibm.com Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Cc: vegard.nossum@gmail.com Cc: adobriyan@gmail.com Cc: oleg@tv-sign.ru Cc: bunk@kernel.org Cc: rjw@sisk.pl Signed-off-by: Ingo Molnar --- kernel/rcutorture.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 5e954edf0ed..90b5b123f7a 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -59,6 +59,7 @@ static int verbose; /* Print more debug info. */ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ static int stutter = 5; /* Start/stop testing interval (in sec) */ +static int irqreader = 1; /* RCU readers from irq (timers). */ static char *torture_type = "rcu"; /* What RCU implementation to torture. */ module_param(nreaders, int, 0444); @@ -75,6 +76,8 @@ module_param(shuffle_interval, int, 0444); MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); module_param(stutter, int, 0444); MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); +module_param(irqreader, int, 0444); +MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); @@ -121,6 +124,7 @@ static atomic_t n_rcu_torture_alloc_fail; static atomic_t n_rcu_torture_free; static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; +static long n_rcu_torture_timers = 0; static struct list_head rcu_torture_removed; static int stutter_pause_test = 0; @@ -217,6 +221,7 @@ struct rcu_torture_ops { void (*sync)(void); void (*cb_barrier)(void); int (*stats)(char *page); + int irqcapable; char *name; }; static struct rcu_torture_ops *cur_ops = NULL; @@ -291,6 +296,7 @@ static struct rcu_torture_ops rcu_ops = { .sync = synchronize_rcu, .cb_barrier = rcu_barrier, .stats = NULL, + .irqcapable = 1, .name = "rcu" }; @@ -331,6 +337,7 @@ static struct rcu_torture_ops rcu_sync_ops = { .sync = synchronize_rcu, .cb_barrier = NULL, .stats = NULL, + .irqcapable = 1, .name = "rcu_sync" }; @@ -392,6 +399,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .sync = rcu_bh_torture_synchronize, .cb_barrier = rcu_barrier_bh, .stats = NULL, + .irqcapable = 1, .name = "rcu_bh" }; @@ -406,6 +414,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { .sync = rcu_bh_torture_synchronize, .cb_barrier = NULL, .stats = NULL, + .irqcapable = 1, .name = "rcu_bh_sync" }; @@ -532,6 +541,7 @@ static struct rcu_torture_ops sched_ops = { .sync = sched_torture_synchronize, .cb_barrier = rcu_barrier_sched, .stats = NULL, + .irqcapable = 1, .name = "sched" }; @@ -619,6 +629,52 @@ rcu_torture_fakewriter(void *arg) return 0; } +/* + * RCU torture reader from timer handler. Dereferences rcu_torture_current, + * incrementing the corresponding element of the pipeline array. 
The + * counter in the element should never be greater than 1, otherwise, the + * RCU implementation is broken. + */ +static void rcu_torture_timer(unsigned long unused) +{ + int idx; + int completed; + static DEFINE_RCU_RANDOM(rand); + static DEFINE_SPINLOCK(rand_lock); + struct rcu_torture *p; + int pipe_count; + + idx = cur_ops->readlock(); + completed = cur_ops->completed(); + p = rcu_dereference(rcu_torture_current); + if (p == NULL) { + /* Leave because rcu_torture_writer is not yet underway */ + cur_ops->readunlock(idx); + return; + } + if (p->rtort_mbtest == 0) + atomic_inc(&n_rcu_torture_mberror); + spin_lock(&rand_lock); + cur_ops->readdelay(&rand); + n_rcu_torture_timers++; + spin_unlock(&rand_lock); + preempt_disable(); + pipe_count = p->rtort_pipe_count; + if (pipe_count > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + pipe_count = RCU_TORTURE_PIPE_LEN; + } + ++__get_cpu_var(rcu_torture_count)[pipe_count]; + completed = cur_ops->completed() - completed; + if (completed > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + completed = RCU_TORTURE_PIPE_LEN; + } + ++__get_cpu_var(rcu_torture_batch)[completed]; + preempt_enable(); + cur_ops->readunlock(idx); +} + /* * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -633,11 +689,18 @@ rcu_torture_reader(void *arg) DEFINE_RCU_RANDOM(rand); struct rcu_torture *p; int pipe_count; + struct timer_list t; VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); set_user_nice(current, 19); + if (irqreader && cur_ops->irqcapable) + setup_timer_on_stack(&t, rcu_torture_timer, 0); do { + if (irqreader && cur_ops->irqcapable) { + if (!timer_pending(&t)) + mod_timer(&t, 1); + } idx = cur_ops->readlock(); completed = cur_ops->completed(); p = rcu_dereference(rcu_torture_current); @@ -669,6 +732,8 @@ rcu_torture_reader(void *arg) rcu_stutter_wait(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); + if (irqreader && cur_ops->irqcapable) + del_timer_sync(&t); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; @@ -699,14 +764,15 @@ rcu_torture_printk(char *page) cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " - "rtmbe: %d", + "rtmbe: %d nt: %ld", rcu_torture_current, rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free), - atomic_read(&n_rcu_torture_mberror)); + atomic_read(&n_rcu_torture_mberror), + n_rcu_torture_timers); if (atomic_read(&n_rcu_torture_mberror) != 0) cnt += sprintf(&page[cnt], " !!!"); cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); @@ -862,10 +928,10 @@ rcu_torture_print_module_parms(char *tag) printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d nfakewriters=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval=%d stutter=%d\n", + "shuffle_interval=%d stutter=%d irqreader=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, - stutter); + stutter, irqreader); } static void -- cgit v1.2.3 From 3d4422332711ef48ef0f132f1fcbfcbd56c7f3d1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jun 2008 11:21:34 +0200 Subject: Add generic helpers for arch IPI function calls This adds kernel/smp.c which contains helpers for 
IPI function calls. In addition to supporting the existing smp_call_function() in a more efficient manner, it also adds a more scalable variant called smp_call_function_single() for calling a given function on a single CPU only. The core of this is based on the x86-64 patch from Nick Piggin, lots of changes since then. "Alan D. Brunelle" has contributed lots of fixes and suggestions as well. Also thanks to Paul E. McKenney for reviewing RCU usage and getting rid of the data allocation fallback deadlock. Acked-by: Ingo Molnar Reviewed-by: Paul E. McKenney Signed-off-by: Jens Axboe --- kernel/Makefile | 1 + kernel/smp.c | 383 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 384 insertions(+) create mode 100644 kernel/smp.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 1c9938addb9..9fa57976f25 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o +obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o diff --git a/kernel/smp.c b/kernel/smp.c new file mode 100644 index 00000000000..f77b75c027a --- /dev/null +++ b/kernel/smp.c @@ -0,0 +1,383 @@ +/* + * Generic helpers for smp ipi calls + * + * (C) Jens Axboe 2008 + * + */ +#include +#include +#include +#include +#include + +static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); +static LIST_HEAD(call_function_queue); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock); + +enum { + CSD_FLAG_WAIT = 0x01, + CSD_FLAG_ALLOC = 0x02, +}; + +struct call_function_data { + struct call_single_data csd; + spinlock_t lock; + unsigned int refs; + cpumask_t cpumask; + struct rcu_head rcu_head; +}; + +struct call_single_queue { + struct list_head list; + spinlock_t lock; +}; + +void __cpuinit init_call_single_data(void) +{ + int i; + + for_each_possible_cpu(i) { + struct call_single_queue *q = &per_cpu(call_single_queue, i); + + spin_lock_init(&q->lock); + INIT_LIST_HEAD(&q->list); + } +} + +static void csd_flag_wait(struct call_single_data *data) +{ + /* Wait for response */ + do { + /* + * We need to see the flags store in the IPI handler + */ + smp_mb(); + if (!(data->flags & CSD_FLAG_WAIT)) + break; + cpu_relax(); + } while (1); +} + +/* + * Insert a previously allocated call_single_data element for execution + * on the given CPU. data must already have ->func, ->info, and ->flags set. + */ +static void generic_exec_single(int cpu, struct call_single_data *data) +{ + struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); + int wait = data->flags & CSD_FLAG_WAIT, ipi; + unsigned long flags; + + spin_lock_irqsave(&dst->lock, flags); + ipi = list_empty(&dst->list); + list_add_tail(&data->list, &dst->list); + spin_unlock_irqrestore(&dst->lock, flags); + + if (ipi) + arch_send_call_function_single_ipi(cpu); + + if (wait) + csd_flag_wait(data); +} + +static void rcu_free_call_data(struct rcu_head *head) +{ + struct call_function_data *data; + + data = container_of(head, struct call_function_data, rcu_head); + + kfree(data); +} + +/* + * Invoked by arch to handle an IPI for call function. Must be called with + * interrupts disabled. 
+ */ +void generic_smp_call_function_interrupt(void) +{ + struct call_function_data *data; + int cpu = get_cpu(); + + /* + * It's ok to use list_for_each_rcu() here even though we may delete + * 'pos', since list_del_rcu() doesn't clear ->next + */ + rcu_read_lock(); + list_for_each_entry_rcu(data, &call_function_queue, csd.list) { + int refs; + + if (!cpu_isset(cpu, data->cpumask)) + continue; + + data->csd.func(data->csd.info); + + spin_lock(&data->lock); + cpu_clear(cpu, data->cpumask); + WARN_ON(data->refs == 0); + data->refs--; + refs = data->refs; + spin_unlock(&data->lock); + + if (refs) + continue; + + spin_lock(&call_function_lock); + list_del_rcu(&data->csd.list); + spin_unlock(&call_function_lock); + + if (data->csd.flags & CSD_FLAG_WAIT) { + /* + * serialize stores to data with the flag clear + * and wakeup + */ + smp_wmb(); + data->csd.flags &= ~CSD_FLAG_WAIT; + } else + call_rcu(&data->rcu_head, rcu_free_call_data); + } + rcu_read_unlock(); + + put_cpu(); +} + +/* + * Invoked by arch to handle an IPI for call function single. Must be called + * from the arch with interrupts disabled. + */ +void generic_smp_call_function_single_interrupt(void) +{ + struct call_single_queue *q = &__get_cpu_var(call_single_queue); + LIST_HEAD(list); + + /* + * Need to see other stores to list head for checking whether + * list is empty without holding q->lock + */ + smp_mb(); + while (!list_empty(&q->list)) { + unsigned int data_flags; + + spin_lock(&q->lock); + list_replace_init(&q->list, &list); + spin_unlock(&q->lock); + + while (!list_empty(&list)) { + struct call_single_data *data; + + data = list_entry(list.next, struct call_single_data, + list); + list_del(&data->list); + + /* + * 'data' can be invalid after this call if + * flags == 0 (when called through + * generic_exec_single(), so save them away before + * making the call. + */ + data_flags = data->flags; + + data->func(data->info); + + if (data_flags & CSD_FLAG_WAIT) { + smp_wmb(); + data->flags &= ~CSD_FLAG_WAIT; + } else if (data_flags & CSD_FLAG_ALLOC) + kfree(data); + } + /* + * See comment on outer loop + */ + smp_mb(); + } +} + +/* + * smp_call_function_single - Run a function on a specific CPU + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @retry: Unused + * @wait: If true, wait until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. Note that @wait + * will be implicitly turned on in case of allocation failures, since + * we fall back to on-stack allocation. + */ +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int retry, int wait) +{ + struct call_single_data d; + unsigned long flags; + /* prevent preemption and reschedule on another processor */ + int me = get_cpu(); + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + if (cpu == me) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + } else { + struct call_single_data *data = NULL; + + if (!wait) { + data = kmalloc(sizeof(*data), GFP_ATOMIC); + if (data) + data->flags = CSD_FLAG_ALLOC; + } + if (!data) { + data = &d; + data->flags = CSD_FLAG_WAIT; + } + + data->func = func; + data->info = info; + generic_exec_single(cpu, data); + } + + put_cpu(); + return 0; +} +EXPORT_SYMBOL(smp_call_function_single); + +/** + * __smp_call_function_single(): Run a function on another CPU + * @cpu: The CPU to run on. 
+ * @data: Pre-allocated and setup data structure + * + * Like smp_call_function_single(), but allow caller to pass in a pre-allocated + * data structure. Useful for embedding @data inside other structures, for + * instance. + * + */ +void __smp_call_function_single(int cpu, struct call_single_data *data) +{ + /* Can deadlock when called with interrupts disabled */ + WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled()); + + generic_exec_single(cpu, data); +} + +/** + * smp_call_function_mask(): Run a function on a set of other CPUs. + * @mask: The set of cpus to run on. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned. Note that @wait + * will be implicitly turned on in case of allocation failures, since + * we fall back to on-stack allocation. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. Preemption + * must be disabled when calling this function. + */ +int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, + int wait) +{ + struct call_function_data d; + struct call_function_data *data = NULL; + cpumask_t allbutself; + unsigned long flags; + int cpu, num_cpus; + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + cpu = smp_processor_id(); + allbutself = cpu_online_map; + cpu_clear(cpu, allbutself); + cpus_and(mask, mask, allbutself); + num_cpus = cpus_weight(mask); + + /* + * If zero CPUs, return. If just a single CPU, turn this request + * into a targetted single call instead since it's faster. + */ + if (!num_cpus) + return 0; + else if (num_cpus == 1) { + cpu = first_cpu(mask); + return smp_call_function_single(cpu, func, info, 0, wait); + } + + if (!wait) { + data = kmalloc(sizeof(*data), GFP_ATOMIC); + if (data) + data->csd.flags = CSD_FLAG_ALLOC; + } + if (!data) { + data = &d; + data->csd.flags = CSD_FLAG_WAIT; + } + + spin_lock_init(&data->lock); + data->csd.func = func; + data->csd.info = info; + data->refs = num_cpus; + data->cpumask = mask; + + spin_lock_irqsave(&call_function_lock, flags); + list_add_tail_rcu(&data->csd.list, &call_function_queue); + spin_unlock_irqrestore(&call_function_lock, flags); + + /* Send a message to all CPUs in the map */ + arch_send_call_function_ipi(mask); + + /* optionally wait for the CPUs to complete */ + if (wait) + csd_flag_wait(&data->csd); + + return 0; +} +EXPORT_SYMBOL(smp_call_function_mask); + +/** + * smp_call_function(): Run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @natomic: Unused + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. In case of allocation + * failure, @wait will be implicitly turned on. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. 
+ */ +int smp_call_function(void (*func)(void *), void *info, int natomic, int wait) +{ + int ret; + + preempt_disable(); + ret = smp_call_function_mask(cpu_online_map, func, info, wait); + preempt_enable(); + return ret; +} +EXPORT_SYMBOL(smp_call_function); + +void ipi_call_lock(void) +{ + spin_lock(&call_function_lock); +} + +void ipi_call_unlock(void) +{ + spin_unlock(&call_function_lock); +} + +void ipi_call_lock_irq(void) +{ + spin_lock_irq(&call_function_lock); +} + +void ipi_call_unlock_irq(void) +{ + spin_unlock_irq(&call_function_lock); +} -- cgit v1.2.3 From 8691e5a8f691cc2a4fda0651e8d307aaba0e7d68 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 Jun 2008 11:18:06 +0200 Subject: smp_call_function: get rid of the unused nonatomic/retry argument It's never used and the comments refer to nonatomic and retry interchangably. So get rid of it. Acked-by: Jeremy Fitzhardinge Signed-off-by: Jens Axboe --- kernel/smp.c | 6 ++---- kernel/softirq.c | 2 +- kernel/time/tick-broadcast.c | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index f77b75c027a..7e0432a4a0e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -195,7 +195,6 @@ void generic_smp_call_function_single_interrupt(void) * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @retry: Unused * @wait: If true, wait until function has completed on other CPUs. * * Returns 0 on success, else a negative status code. Note that @wait @@ -203,7 +202,7 @@ void generic_smp_call_function_single_interrupt(void) * we fall back to on-stack allocation. */ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int retry, int wait) + int wait) { struct call_single_data d; unsigned long flags; @@ -339,7 +338,6 @@ EXPORT_SYMBOL(smp_call_function_mask); * smp_call_function(): Run a function on all other CPUs. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @natomic: Unused * @wait: If true, wait (atomically) until function has completed on other CPUs. * * Returns 0 on success, else a negative status code. @@ -351,7 +349,7 @@ EXPORT_SYMBOL(smp_call_function_mask); * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. 
*/ -int smp_call_function(void (*func)(void *), void *info, int natomic, int wait) +int smp_call_function(void (*func)(void *), void *info, int wait) { int ret; diff --git a/kernel/softirq.c b/kernel/softirq.c index 36e06174004..d73afb4764e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -679,7 +679,7 @@ int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait) int ret = 0; preempt_disable(); - ret = smp_call_function(func, info, retry, wait); + ret = smp_call_function(func, info, wait); local_irq_disable(); func(info); local_irq_enable(); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 57a1f02e5ec..75e718539dc 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -266,7 +266,7 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu) "offline CPU #%d\n", *oncpu); else smp_call_function_single(*oncpu, tick_do_broadcast_on_off, - &reason, 1, 1); + &reason, 1); } /* -- cgit v1.2.3 From 15c8b6c1aaaf1c4edd67e2f02e4d8e1bd1a51c0d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 May 2008 09:39:44 +0200 Subject: on_each_cpu(): kill unused 'retry' parameter It's not even passed on to smp_call_function() anymore, since that was removed. So kill it. Acked-by: Jeremy Fitzhardinge Reviewed-by: Paul E. McKenney Signed-off-by: Jens Axboe --- kernel/hrtimer.c | 2 +- kernel/profile.c | 6 +++--- kernel/rcupdate.c | 2 +- kernel/softirq.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 421be5fe5cc..50e8616d795 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -623,7 +623,7 @@ static void retrigger_next_event(void *arg) void clock_was_set(void) { /* Retrigger the CPU local events everywhere */ - on_each_cpu(retrigger_next_event, NULL, 0, 1); + on_each_cpu(retrigger_next_event, NULL, 1); } /* diff --git a/kernel/profile.c b/kernel/profile.c index ae7ead82cbc..58926411eb2 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -252,7 +252,7 @@ static void profile_flip_buffers(void) mutex_lock(&profile_flip_mutex); j = per_cpu(cpu_profile_flip, get_cpu()); put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 0, 1); + on_each_cpu(__profile_flip_buffers, NULL, 1); for_each_online_cpu(cpu) { struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; for (i = 0; i < NR_PROFILE_HIT; ++i) { @@ -275,7 +275,7 @@ static void profile_discard_flip_buffers(void) mutex_lock(&profile_flip_mutex); i = per_cpu(cpu_profile_flip, get_cpu()); put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 0, 1); + on_each_cpu(__profile_flip_buffers, NULL, 1); for_each_online_cpu(cpu) { struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); @@ -558,7 +558,7 @@ static int __init create_hash_tables(void) out_cleanup: prof_on = 0; smp_mb(); - on_each_cpu(profile_nop, NULL, 0, 1); + on_each_cpu(profile_nop, NULL, 1); for_each_online_cpu(cpu) { struct page *page; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c09605f8d16..6addab5e6d8 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -127,7 +127,7 @@ void rcu_barrier(void) * until all the callbacks are queued. 
*/ rcu_read_lock(); - on_each_cpu(rcu_barrier_func, NULL, 0, 1); + on_each_cpu(rcu_barrier_func, NULL, 1); rcu_read_unlock(); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); diff --git a/kernel/softirq.c b/kernel/softirq.c index d73afb4764e..c159fd09477 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -674,7 +674,7 @@ __init int spawn_ksoftirqd(void) /* * Call a function on all processors */ -int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait) +int on_each_cpu(void (*func) (void *info), void *info, int wait) { int ret = 0; -- cgit v1.2.3 From ce0d1b6f73870878aae622b72e85fe8f7a16b51c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 27 Jun 2008 11:50:32 +0200 Subject: fix: "smp_call_function: get rid of the unused nonatomic/retry argument" fix: kernel/smp.c: In function 'smp_call_function_mask': kernel/smp.c:303: error: too many arguments to function 'smp_call_function_single' Signed-off-by: Ingo Molnar --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 7e0432a4a0e..4f582b257eb 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -300,7 +300,7 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, return 0; else if (num_cpus == 1) { cpu = first_cpu(mask); - return smp_call_function_single(cpu, func, info, 0, wait); + return smp_call_function_single(cpu, func, info, wait); } if (!wait) { -- cgit v1.2.3 From bf647b62fdb948e757a7b4d18d4f16e3c763b1d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:10 +0200 Subject: sched: clean up some unused variables In file included from /mnt/build/linux-2.6/kernel/sched.c:1496: /mnt/build/linux-2.6/kernel/sched_rt.c: In function '__enable_runtime': /mnt/build/linux-2.6/kernel/sched_rt.c:339: warning: unused variable 'rd' /mnt/build/linux-2.6/kernel/sched_rt.c: In function 'requeue_rt_entity': /mnt/build/linux-2.6/kernel/sched_rt.c:692: warning: unused variable 'queue' Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bd90c8bb073..6b4a6b5a416 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -336,7 +336,6 @@ static void disable_runtime(struct rq *rq) static void __enable_runtime(struct rq *rq) { - struct root_domain *rd = rq->rd; struct rt_rq *rt_rq; if (unlikely(!scheduler_running)) @@ -689,7 +688,6 @@ static void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) { struct rt_prio_array *array = &rt_rq->active; - struct list_head *queue = array->queue + rt_se_prio(rt_se); if (on_rt_rq(rt_se)) { list_del_init(&rt_se->run_list); -- cgit v1.2.3 From a7be37ac8e1565e00880531f4e2aff421a21c803 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:11 +0200 Subject: sched: revert the revert of: weight calculations Try again.. 
initial commit: 8f1bc385cfbab474db6c27b5af1e439614f3025c revert: f9305d4a0968201b2818dbed0dc8cb0d4ee7aeb3 Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 ++--- kernel/sched_fair.c | 105 +++++++++++++++++++++++++++++++++--------------- kernel/sched_features.h | 1 + 3 files changed, 76 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c51d9fae8cd..f653af684fb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1342,6 +1342,9 @@ static void __resched_task(struct task_struct *p, int tif_bit) */ #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) +/* + * delta *= weight / lw + */ static unsigned long calc_delta_mine(unsigned long delta_exec, unsigned long weight, struct load_weight *lw) @@ -1369,12 +1372,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); } -static inline unsigned long -calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) -{ - return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); -} - static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 1fe4c65a817..496500988ce 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -333,6 +333,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, } #endif +/* + * delta *= w / rw + */ +static inline unsigned long +calc_delta_weight(unsigned long delta, struct sched_entity *se) +{ + for_each_sched_entity(se) { + delta = calc_delta_mine(delta, + se->load.weight, &cfs_rq_of(se)->load); + } + + return delta; +} + +/* + * delta *= rw / w + */ +static inline unsigned long +calc_delta_fair(unsigned long delta, struct sched_entity *se) +{ + for_each_sched_entity(se) { + delta = calc_delta_mine(delta, + cfs_rq_of(se)->load.weight, &se->load); + } + + return delta; +} + /* * The idea is to set a period in which each task runs once. * @@ -362,47 +390,54 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 slice = __sched_period(cfs_rq->nr_running); - - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - - slice *= se->load.weight; - do_div(slice, cfs_rq->load.weight); - } - - - return slice; + return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); } /* * We calculate the vruntime slice of a to be inserted task * - * vs = s/w = p/rw + * vs = s*rw/w = p */ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long nr_running = cfs_rq->nr_running; - unsigned long weight; - u64 vslice; if (!se->on_rq) nr_running++; - vslice = __sched_period(nr_running); + return __sched_period(nr_running); +} + +/* + * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in + * that it favours >=0 over <0. + * + * -20 | + * | + * 0 --------+------- + * .' + * 19 .' 
+ * + */ +static unsigned long +calc_delta_asym(unsigned long delta, struct sched_entity *se) +{ + struct load_weight lw = { + .weight = NICE_0_LOAD, + .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) + }; for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); + struct load_weight *se_lw = &se->load; - weight = cfs_rq->load.weight; - if (!se->on_rq) - weight += se->load.weight; + if (se->load.weight < NICE_0_LOAD) + se_lw = &lw; - vslice *= NICE_0_LOAD; - do_div(vslice, weight); + delta = calc_delta_mine(delta, + cfs_rq_of(se)->load.weight, se_lw); } - return vslice; + return delta; } /* @@ -419,11 +454,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); - delta_exec_weighted = delta_exec; - if (unlikely(curr->load.weight != NICE_0_LOAD)) { - delta_exec_weighted = calc_delta_fair(delta_exec_weighted, - &curr->load); - } + delta_exec_weighted = calc_delta_fair(delta_exec, curr); curr->vruntime += delta_exec_weighted; } @@ -609,8 +640,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (!initial) { /* sleeps upto a single latency don't count. */ - if (sched_feat(NEW_FAIR_SLEEPERS)) - vruntime -= sysctl_sched_latency; + if (sched_feat(NEW_FAIR_SLEEPERS)) { + unsigned long thresh = sysctl_sched_latency; + + /* + * convert the sleeper threshold into virtual time + */ + if (sched_feat(NORMALIZED_SLEEPER)) + thresh = calc_delta_fair(thresh, se); + + vruntime -= thresh; + } /* ensure we never gain time by being placed backwards. */ vruntime = max_vruntime(se->vruntime, vruntime); @@ -1111,11 +1151,10 @@ static unsigned long wakeup_gran(struct sched_entity *se) unsigned long gran = sysctl_sched_wakeup_granularity; /* - * More easily preempt - nice tasks, while not making - * it harder for + nice tasks. + * More easily preempt - nice tasks, while not making it harder for + * + nice tasks. */ - if (unlikely(se->load.weight > NICE_0_LOAD)) - gran = calc_delta_fair(gran, &se->load); + gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); return gran; } diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 62b39ca92eb..afa549166d8 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -1,4 +1,5 @@ SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) +SCHED_FEAT(NORMALIZED_SLEEPER, 1) SCHED_FEAT(WAKEUP_PREEMPT, 1) SCHED_FEAT(START_DEBIT, 1) SCHED_FEAT(AFFINE_WAKEUPS, 1) -- cgit v1.2.3 From c9c294a630e28eec5f2865f028ecfc58d45c0a5a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:12 +0200 Subject: sched: fix calc_delta_asym() calc_delta_asym() is supposed to do the same as calc_delta_fair() except linearly shrink the result for negative nice processes - this causes them to have a smaller preemption threshold so that they are more easily preempted. The problem is that for task groups se->load.weight is the per cpu share of the actual task group weight; take that into account. Also provide a debug switch to disable the asymmetry (which I still don't like - but it does greatly benefit some workloads) This would explain the interactivity issues reported against group scheduling. 
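A standalone userspace sketch of the share scaling this fix introduces (the hunk below applies it inside calc_delta_asym()); the numbers are invented, and NICE_0_LOAD is the 1024 used throughout this tree. A group configured with fewer shares than NICE_0_LOAD gets its per-cpu weight rescaled as if the whole group weighed NICE_0_LOAD, so the asymmetric wakeup granularity is computed against a comparable baseline.

#include <stdio.h>

#define NICE_0_LOAD 1024UL

int main(void)
{
        unsigned long tg_shares = 512;   /* group's configured shares        */
        unsigned long se_weight = 256;   /* this cpu's slice of those shares */
        unsigned long scaled;

        /* mirrors: lw.weight *= se->load.weight; lw.weight /= tg->shares; */
        scaled = NICE_0_LOAD * se_weight / tg_shares;

        printf("per-cpu weight %lu rescaled to %lu (as if the group weighed %lu)\n",
               se_weight, scaled, NICE_0_LOAD);
        return 0;
}

With these made-up numbers the per-cpu weight 256 is treated as 512, i.e. half of NICE_0_LOAD, matching the group holding half of a nice-0 task's worth of shares on this cpu.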
Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 28 +++++++++++++++++++++++++++- kernel/sched_features.h | 1 + 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 496500988ce..2268e634812 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -430,6 +430,29 @@ calc_delta_asym(unsigned long delta, struct sched_entity *se) for_each_sched_entity(se) { struct load_weight *se_lw = &se->load; +#ifdef CONFIG_FAIR_SCHED_GROUP + struct cfs_rq *cfs_rq = se->my_q; + struct task_group *tg = NULL + + if (cfs_rq) + tg = cfs_rq->tg; + + if (tg && tg->shares < NICE_0_LOAD) { + /* + * scale shares to what it would have been had + * tg->weight been NICE_0_LOAD: + * + * weight = 1024 * shares / tg->weight + */ + lw.weight *= se->load.weight; + lw.weight /= tg->shares; + + lw.inv_weight = 0; + + se_lw = &lw; + } else +#endif + if (se->load.weight < NICE_0_LOAD) se_lw = &lw; @@ -1154,7 +1177,10 @@ static unsigned long wakeup_gran(struct sched_entity *se) * More easily preempt - nice tasks, while not making it harder for * + nice tasks. */ - gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); + if (sched_feat(ASYM_GRAN)) + gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); + else + gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); return gran; } diff --git a/kernel/sched_features.h b/kernel/sched_features.h index afa549166d8..04123af2e67 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -7,3 +7,4 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1) SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(HRTICK, 1) SCHED_FEAT(DOUBLE_TICK, 0) +SCHED_FEAT(ASYM_GRAN, 1) -- cgit v1.2.3 From ced8aa16e1db55c33c507174c1b1f9e107445865 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:13 +0200 Subject: sched: fix calc_delta_asym, #2 Ok, so why are we in this mess, it was: 1/w but now we mixed that rw in the mix like: rw/w rw being \Sum w suggests: fiddling w, we should also fiddle rw, humm? Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2268e634812..2e197b8e43f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -429,6 +429,7 @@ calc_delta_asym(unsigned long delta, struct sched_entity *se) for_each_sched_entity(se) { struct load_weight *se_lw = &se->load; + unsigned long rw = cfs_rq_of(se)->load.weight; #ifdef CONFIG_FAIR_SCHED_GROUP struct cfs_rq *cfs_rq = se->my_q; @@ -450,14 +451,16 @@ calc_delta_asym(unsigned long delta, struct sched_entity *se) lw.inv_weight = 0; se_lw = &lw; + rw += lw.weight - se->load.weight; } else #endif - if (se->load.weight < NICE_0_LOAD) + if (se->load.weight < NICE_0_LOAD) { se_lw = &lw; + rw += NICE_0_LOAD - se->load.weight; + } - delta = calc_delta_mine(delta, - cfs_rq_of(se)->load.weight, se_lw); + delta = calc_delta_mine(delta, rw, se_lw); } return delta; -- cgit v1.2.3 From c09595f63bb1909c5dc4dca288f4fe818561b5f3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:14 +0200 Subject: sched: revert revert of: fair-group: SMP-nice for group scheduling Try again.. 
Initial commit: 18d95a2832c1392a2d63227a7a6d433cb9f2037e Revert: 6363ca57c76b7b83639ca8c83fc285fa26a7880e Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 430 +++++++++++++++++++++++++++++++++++++++++++++++---- kernel/sched_debug.c | 5 + kernel/sched_fair.c | 124 +++++++++------ kernel/sched_rt.c | 4 + 4 files changed, 488 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f653af684fb..874b6da1543 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -403,6 +403,43 @@ struct cfs_rq { */ struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_SMP + unsigned long task_weight; + unsigned long shares; + /* + * We need space to build a sched_domain wide view of the full task + * group tree, in order to avoid depending on dynamic memory allocation + * during the load balancing we place this in the per cpu task group + * hierarchy. This limits the load balancing to one instance per cpu, + * but more should not be needed anyway. + */ + struct aggregate_struct { + /* + * load = weight(cpus) * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long load; + + /* + * part of the group weight distributed to this span. + */ + unsigned long shares; + + /* + * The sum of all runqueue weights within this span. + */ + unsigned long rq_weight; + + /* + * Weight contributed by tasks; this is the part we can + * influence by moving tasks around. + */ + unsigned long task_weight; + } aggregate; +#endif #endif }; @@ -1484,6 +1521,326 @@ static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* + * Group load balancing. + * + * We calculate a few balance domain wide aggregate numbers; load and weight. + * Given the pictures below, and assuming each item has equal weight: + * + * root 1 - thread + * / | \ A - group + * A 1 B + * /|\ / \ + * C 2 D 3 4 + * | | + * 5 6 + * + * load: + * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd, + * which equals 1/9-th of the total load. + * + * shares: + * The weight of this group on the selected cpus. + * + * rq_weight: + * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while + * B would get 2. + * + * task_weight: + * Part of the rq_weight contributed by tasks; all groups except B would + * get 1, B gets 2. + */ + +static inline struct aggregate_struct * +aggregate(struct task_group *tg, struct sched_domain *sd) +{ + return &tg->cfs_rq[sd->first_cpu]->aggregate; +} + +typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + */ +static +void aggregate_walk_tree(aggregate_func down, aggregate_func up, + struct sched_domain *sd) +{ + struct task_group *parent, *child; + + rcu_read_lock(); + parent = &root_task_group; +down: + (*down)(parent, sd); + list_for_each_entry_rcu(child, &parent->children, siblings) { + parent = child; + goto down; + +up: + continue; + } + (*up)(parent, sd); + + child = parent; + parent = parent->parent; + if (parent) + goto up; + rcu_read_unlock(); +} + +/* + * Calculate the aggregate runqueue weight. 
+ */ +static +void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long rq_weight = 0; + unsigned long task_weight = 0; + int i; + + for_each_cpu_mask(i, sd->span) { + rq_weight += tg->cfs_rq[i]->load.weight; + task_weight += tg->cfs_rq[i]->task_weight; + } + + aggregate(tg, sd)->rq_weight = rq_weight; + aggregate(tg, sd)->task_weight = task_weight; +} + +/* + * Compute the weight of this group on the given cpus. + */ +static +void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long shares = 0; + int i; + + for_each_cpu_mask(i, sd->span) + shares += tg->cfs_rq[i]->shares; + + if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares) + shares = tg->shares; + + aggregate(tg, sd)->shares = shares; +} + +/* + * Compute the load fraction assigned to this group, relies on the aggregate + * weight and this group's parent's load, i.e. top-down. + */ +static +void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long load; + + if (!tg->parent) { + int i; + + load = 0; + for_each_cpu_mask(i, sd->span) + load += cpu_rq(i)->load.weight; + + } else { + load = aggregate(tg->parent, sd)->load; + + /* + * shares is our weight in the parent's rq so + * shares/parent->rq_weight gives our fraction of the load + */ + load *= aggregate(tg, sd)->shares; + load /= aggregate(tg->parent, sd)->rq_weight + 1; + } + + aggregate(tg, sd)->load = load; +} + +static void __set_se_shares(struct sched_entity *se, unsigned long shares); + +/* + * Calculate and set the cpu's group shares. + */ +static void +__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, + int tcpu) +{ + int boost = 0; + unsigned long shares; + unsigned long rq_weight; + + if (!tg->se[tcpu]) + return; + + rq_weight = tg->cfs_rq[tcpu]->load.weight; + + /* + * If there are currently no tasks on the cpu pretend there is one of + * average load so that when a new task gets to run here it will not + * get delayed by group starvation. + */ + if (!rq_weight) { + boost = 1; + rq_weight = NICE_0_LOAD; + } + + /* + * \Sum shares * rq_weight + * shares = ----------------------- + * \Sum rq_weight + * + */ + shares = aggregate(tg, sd)->shares * rq_weight; + shares /= aggregate(tg, sd)->rq_weight + 1; + + /* + * record the actual number of shares, not the boosted amount. + */ + tg->cfs_rq[tcpu]->shares = boost ? 0 : shares; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + else if (shares > MAX_SHARES) + shares = MAX_SHARES; + + __set_se_shares(tg->se[tcpu], shares); +} + +/* + * Re-adjust the weights on the cpu the task came from and on the cpu the + * task went to. + */ +static void +__move_group_shares(struct task_group *tg, struct sched_domain *sd, + int scpu, int dcpu) +{ + unsigned long shares; + + shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; + + __update_group_shares_cpu(tg, sd, scpu); + __update_group_shares_cpu(tg, sd, dcpu); + + /* + * ensure we never loose shares due to rounding errors in the + * above redistribution. + */ + shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; + if (shares) + tg->cfs_rq[dcpu]->shares += shares; +} + +/* + * Because changing a group's shares changes the weight of the super-group + * we need to walk up the tree and change all shares until we hit the root. 
+ */ +static void +move_group_shares(struct task_group *tg, struct sched_domain *sd, + int scpu, int dcpu) +{ + while (tg) { + __move_group_shares(tg, sd, scpu, dcpu); + tg = tg->parent; + } +} + +static +void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long shares = aggregate(tg, sd)->shares; + int i; + + for_each_cpu_mask(i, sd->span) { + struct rq *rq = cpu_rq(i); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __update_group_shares_cpu(tg, sd, i); + spin_unlock_irqrestore(&rq->lock, flags); + } + + aggregate_group_shares(tg, sd); + + /* + * ensure we never loose shares due to rounding errors in the + * above redistribution. + */ + shares -= aggregate(tg, sd)->shares; + if (shares) { + tg->cfs_rq[sd->first_cpu]->shares += shares; + aggregate(tg, sd)->shares += shares; + } +} + +/* + * Calculate the accumulative weight and recursive load of each task group + * while walking down the tree. + */ +static +void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) +{ + aggregate_group_weight(tg, sd); + aggregate_group_shares(tg, sd); + aggregate_group_load(tg, sd); +} + +/* + * Rebalance the cpu shares while walking back up the tree. + */ +static +void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) +{ + aggregate_group_set_shares(tg, sd); +} + +static DEFINE_PER_CPU(spinlock_t, aggregate_lock); + +static void __init init_aggregate(void) +{ + int i; + + for_each_possible_cpu(i) + spin_lock_init(&per_cpu(aggregate_lock, i)); +} + +static int get_aggregate(struct sched_domain *sd) +{ + if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) + return 0; + + aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); + return 1; +} + +static void put_aggregate(struct sched_domain *sd) +{ + spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); +} + +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +{ + cfs_rq->shares = shares; +} + +#else + +static inline void init_aggregate(void) +{ +} + +static inline int get_aggregate(struct sched_domain *sd) +{ + return 0; +} + +static inline void put_aggregate(struct sched_domain *sd) +{ +} +#endif + #endif #include "sched_stats.h" @@ -1498,26 +1855,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) -static inline void inc_load(struct rq *rq, const struct task_struct *p) -{ - update_load_add(&rq->load, p->se.load.weight); -} - -static inline void dec_load(struct rq *rq, const struct task_struct *p) -{ - update_load_sub(&rq->load, p->se.load.weight); -} - -static void inc_nr_running(struct task_struct *p, struct rq *rq) +static void inc_nr_running(struct rq *rq) { rq->nr_running++; - inc_load(rq, p); } -static void dec_nr_running(struct task_struct *p, struct rq *rq) +static void dec_nr_running(struct rq *rq) { rq->nr_running--; - dec_load(rq, p); } static void set_load_weight(struct task_struct *p) @@ -1609,7 +1954,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) rq->nr_uninterruptible--; enqueue_task(rq, p, wakeup); - inc_nr_running(p, rq); + inc_nr_running(rq); } /* @@ -1621,7 +1966,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) rq->nr_uninterruptible++; dequeue_task(rq, p, sleep); - dec_nr_running(p, rq); + dec_nr_running(rq); } /** @@ -2274,7 +2619,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * management (if 
any): */ p->sched_class->task_new(rq, p); - inc_nr_running(p, rq); + inc_nr_running(rq); } check_preempt_curr(rq, p); #ifdef CONFIG_SMP @@ -3265,9 +3610,12 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long imbalance; struct rq *busiest; unsigned long flags; + int unlock_aggregate; cpus_setall(*cpus); + unlock_aggregate = get_aggregate(sd); + /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, @@ -3383,8 +3731,9 @@ redo: if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - return ld_moved; + ld_moved = -1; + + goto out; out_balanced: schedstat_inc(sd, lb_balanced[idle]); @@ -3399,8 +3748,13 @@ out_one_pinned: if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - return 0; + ld_moved = -1; + else + ld_moved = 0; +out: + if (unlock_aggregate) + put_aggregate(sd); + return ld_moved; } /* @@ -4588,10 +4942,8 @@ void set_user_nice(struct task_struct *p, long nice) goto out_unlock; } on_rq = p->se.on_rq; - if (on_rq) { + if (on_rq) dequeue_task(rq, p, 0); - dec_load(rq, p); - } p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -4601,7 +4953,6 @@ void set_user_nice(struct task_struct *p, long nice) if (on_rq) { enqueue_task(rq, p, 0); - inc_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -7016,6 +7367,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); sd->span = *cpu_map; + sd->first_cpu = first_cpu(sd->span); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@ -7026,6 +7378,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, NODE); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), &sd->span); + sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7037,6 +7390,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, CPU); set_domain_attribute(sd, attr); sd->span = *nodemask; + sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7048,6 +7402,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, MC); set_domain_attribute(sd, attr); sd->span = cpu_coregroup_map(i); + sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7060,6 +7415,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); sd->span = per_cpu(cpu_sibling_map, i); + sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7757,6 +8113,7 @@ void __init sched_init(void) } #ifdef CONFIG_SMP + init_aggregate(); init_defrootdomain(); #endif @@ -8322,14 +8679,11 @@ void sched_move_task(struct task_struct *tsk) #endif /* CONFIG_GROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED -static void set_se_shares(struct sched_entity *se, unsigned long shares) +static void __set_se_shares(struct sched_entity *se, unsigned long shares) { struct cfs_rq *cfs_rq = se->cfs_rq; - struct rq *rq = cfs_rq->rq; int on_rq; - spin_lock_irq(&rq->lock); - on_rq = se->on_rq; if (on_rq) dequeue_entity(cfs_rq, se, 0); @@ -8339,8 +8693,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) if (on_rq) enqueue_entity(cfs_rq, se, 
0); +} - spin_unlock_irq(&rq->lock); +static void set_se_shares(struct sched_entity *se, unsigned long shares) +{ + struct cfs_rq *cfs_rq = se->cfs_rq; + struct rq *rq = cfs_rq->rq; + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __set_se_shares(se, shares); + spin_unlock_irqrestore(&rq->lock, flags); } static DEFINE_MUTEX(shares_mutex); @@ -8379,8 +8742,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * w/o tripping rebalance_share or load_balance_fair. */ tg->shares = shares; - for_each_possible_cpu(i) + for_each_possible_cpu(i) { + /* + * force a rebalance + */ + cfs_rq_set_shares(tg->cfs_rq[i], 0); set_se_shares(tg->se[i], shares); + } /* * Enable load balance activity on this group, by inserting it back on diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 8e077b9c91c..04394ccac88 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -167,6 +167,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #endif SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", cfs_rq->nr_spread_over); +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_SMP + SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); +#endif +#endif } void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2e197b8e43f..183388c4dea 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -567,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ + cfs_rq->task_weight += weight; +} +#else +static inline void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ +} +#endif + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + inc_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) + add_cfs_task_weight(cfs_rq, se->load.weight); cfs_rq->nr_running++; se->on_rq = 1; list_add(&se->group_node, &cfs_rq->tasks); @@ -580,6 +597,10 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + dec_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) + add_cfs_task_weight(cfs_rq, -se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; list_del_init(&se->group_node); @@ -1372,75 +1393,90 @@ static struct task_struct *load_balance_next_fair(void *arg) return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); } -#ifdef CONFIG_FAIR_GROUP_SCHED -static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) +static unsigned long +__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, + struct cfs_rq *cfs_rq) { - struct sched_entity *curr; - struct task_struct *p; - - if (!cfs_rq->nr_running || !first_fair(cfs_rq)) - return MAX_PRIO; - - curr = cfs_rq->curr; - if (!curr) - curr = __pick_next_entity(cfs_rq); + struct rq_iterator cfs_rq_iterator; - p = task_of(curr); + cfs_rq_iterator.start = load_balance_start_fair; + cfs_rq_iterator.next = load_balance_next_fair; + cfs_rq_iterator.arg = cfs_rq; - return p->prio; + return balance_tasks(this_rq, this_cpu, busiest, + max_load_move, sd, 
idle, all_pinned, + this_best_prio, &cfs_rq_iterator); } -#endif +#ifdef CONFIG_FAIR_GROUP_SCHED static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio) { - struct cfs_rq *busy_cfs_rq; long rem_load_move = max_load_move; - struct rq_iterator cfs_rq_iterator; - - cfs_rq_iterator.start = load_balance_start_fair; - cfs_rq_iterator.next = load_balance_next_fair; + int busiest_cpu = cpu_of(busiest); + struct task_group *tg; - for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { -#ifdef CONFIG_FAIR_GROUP_SCHED - struct cfs_rq *this_cfs_rq; + rcu_read_lock(); + list_for_each_entry(tg, &task_groups, list) { long imbalance; - unsigned long maxload; + unsigned long this_weight, busiest_weight; + long rem_load, max_load, moved_load; + + /* + * empty group + */ + if (!aggregate(tg, sd)->task_weight) + continue; + + rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; + rem_load /= aggregate(tg, sd)->load + 1; + + this_weight = tg->cfs_rq[this_cpu]->task_weight; + busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; - this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); + imbalance = (busiest_weight - this_weight) / 2; - imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; - /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ - if (imbalance <= 0) + if (imbalance < 0) + imbalance = busiest_weight; + + max_load = max(rem_load, imbalance); + moved_load = __load_balance_fair(this_rq, this_cpu, busiest, + max_load, sd, idle, all_pinned, this_best_prio, + tg->cfs_rq[busiest_cpu]); + + if (!moved_load) continue; - /* Don't pull more than imbalance/2 */ - imbalance /= 2; - maxload = min(rem_load_move, imbalance); + move_group_shares(tg, sd, busiest_cpu, this_cpu); - *this_best_prio = cfs_rq_best_prio(this_cfs_rq); -#else -# define maxload rem_load_move -#endif - /* - * pass busy_cfs_rq argument into - * load_balance_[start|next]_fair iterators - */ - cfs_rq_iterator.arg = busy_cfs_rq; - rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, - maxload, sd, idle, all_pinned, - this_best_prio, - &cfs_rq_iterator); + moved_load *= aggregate(tg, sd)->load; + moved_load /= aggregate(tg, sd)->rq_weight + 1; - if (rem_load_move <= 0) + rem_load_move -= moved_load; + if (rem_load_move < 0) break; } + rcu_read_unlock(); return max_load_move - rem_load_move; } +#else +static unsigned long +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) +{ + return __load_balance_fair(this_rq, this_cpu, busiest, + max_load_move, sd, idle, all_pinned, + this_best_prio, &busiest->cfs); +} +#endif static int move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 6b4a6b5a416..765932d0399 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -670,6 +670,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) rt_se->timeout = 0; enqueue_rt_entity(rt_se); + + inc_cpu_load(rq, p->se.load.weight); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) @@ -678,6 +680,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) update_curr_rt(rq); dequeue_rt_entity(rt_se); + + dec_cpu_load(rq, p->se.load.weight); } /* -- cgit v1.2.3 From 76a2a6ee8a0660a29127f05989ac59ae1ce865fa Mon 
Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:15 +0200 Subject: sched: sched_clock_cpu() based cpu_clock() with sched_clock_cpu() being reasonably in sync between cpus (max 1 jiffy difference) use this to provide cpu_clock(). Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 76 ---------------------------------------------------- kernel/sched_clock.c | 12 +++++++++ 2 files changed, 12 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 874b6da1543..eb3454c410f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -818,82 +818,6 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } -unsigned long long time_sync_thresh = 100000; - -static DEFINE_PER_CPU(unsigned long long, time_offset); -static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); - -/* - * Global lock which we take every now and then to synchronize - * the CPUs time. This method is not warp-safe, but it's good - * enough to synchronize slowly diverging time sources and thus - * it's good enough for tracing: - */ -static DEFINE_SPINLOCK(time_sync_lock); -static unsigned long long prev_global_time; - -static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu) -{ - /* - * We want this inlined, to not get tracer function calls - * in this critical section: - */ - spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_); - __raw_spin_lock(&time_sync_lock.raw_lock); - - if (time < prev_global_time) { - per_cpu(time_offset, cpu) += prev_global_time - time; - time = prev_global_time; - } else { - prev_global_time = time; - } - - __raw_spin_unlock(&time_sync_lock.raw_lock); - spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_); - - return time; -} - -static unsigned long long __cpu_clock(int cpu) -{ - unsigned long long now; - - /* - * Only call sched_clock() if the scheduler has already been - * initialized (some code might call cpu_clock() very early): - */ - if (unlikely(!scheduler_running)) - return 0; - - now = sched_clock_cpu(cpu); - - return now; -} - -/* - * For kernel-internal use: high-speed (but slightly incorrect) per-cpu - * clock constructed from sched_clock(): - */ -unsigned long long cpu_clock(int cpu) -{ - unsigned long long prev_cpu_time, time, delta_time; - unsigned long flags; - - local_irq_save(flags); - prev_cpu_time = per_cpu(prev_cpu_time, cpu); - time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); - delta_time = time-prev_cpu_time; - - if (unlikely(delta_time > time_sync_thresh)) { - time = __sync_cpu_clock(time, cpu); - per_cpu(prev_cpu_time, cpu) = time; - } - local_irq_restore(flags); - - return time; -} -EXPORT_SYMBOL_GPL(cpu_clock); - #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index ce05271219a..3c696db5945 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -244,3 +244,15 @@ unsigned long long __attribute__((weak)) sched_clock(void) { return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); } + +unsigned long long cpu_clock(int cpu) +{ + unsigned long long clock; + unsigned long flags; + + raw_local_irq_save(flags); + clock = sched_clock_cpu(cpu); + raw_local_irq_restore(flags); + + return clock; +} -- cgit v1.2.3 From 103638d95ba5b0c53c8d9c0cb581156ccc8513ee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:16 +0200 Subject: sched: fix wakeup granularity and buddy 
granularity Uncouple buddy selection from wakeup granularity. The initial idea was that buddies could run ahead as far as a normal task can - do this by measuring a pair 'slice' just as we do for a normal task. This means we can drop the wakeup_granularity back to 5ms. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + kernel/sched_fair.c | 15 +++++++-------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index eb3454c410f..7d282c52bd4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -375,6 +375,7 @@ struct cfs_rq { u64 exec_clock; u64 min_vruntime; + u64 pair_start; struct rb_root tasks_timeline; struct rb_node *rb_leftmost; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 183388c4dea..509092af033 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield; /* * SCHED_OTHER wake-up granularity. - * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_wakeup_granularity = 10000000UL; +unsigned int sysctl_sched_wakeup_granularity = 5000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -813,17 +813,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); - static struct sched_entity * pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (!cfs_rq->next) - return se; + struct rq *rq = rq_of(cfs_rq); + u64 pair_slice = rq->clock - cfs_rq->pair_start; - if (wakeup_preempt_entity(cfs_rq->next, se) != 0) + if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) { + cfs_rq->pair_start = rq->clock; return se; + } return cfs_rq->next; } -- cgit v1.2.3 From 32df2ee86a580f70f2dbb90cf81f413aa655f838 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:17 +0200 Subject: sched: add full schedstats to /proc/sched_debug show all the schedstats in /debug/sched_debug as well. 
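The new output (in the hunk below) is produced with a short-lived P() macro so each counter name only has to be written once: #n stringizes the field name while rq->n reads its value. A stand-alone sketch of the same preprocessor trick, using an invented struct purely for illustration:

	#include <stdio.h>

	struct rq_stats {
		int yld_count;
		int sched_count;
		int ttwu_count;
	};

	int main(void)
	{
		struct rq_stats st = { .yld_count = 3, .sched_count = 17, .ttwu_count = 5 };

	/* #n becomes the string "yld_count" etc., st.n reads the field itself */
	#define P(n) printf("  .%-30s: %d\n", #n, st.n)
		P(yld_count);
		P(sched_count);
		P(ttwu_count);
	#undef P

		return 0;
	}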
Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 04394ccac88..bbe6b31c3c5 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -162,8 +162,23 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, " .%-30s: %d\n", "bkl_count", - rq->bkl_count); +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); + + P(yld_exp_empty); + P(yld_act_empty); + P(yld_both_empty); + P(yld_count); + + P(sched_switch); + P(sched_count); + P(sched_goidle); + + P(ttwu_count); + P(ttwu_local); + + P(bkl_count); + +#undef P #endif SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", cfs_rq->nr_spread_over); -- cgit v1.2.3 From b6a86c746f5b708012809958462234d19e9c8177 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:18 +0200 Subject: sched: fix sched_domain aggregation Keeping the aggregate on the first cpu of the sched domain has two problems: - it could collide between different sched domains on different cpus - it could slow things down because of the remote accesses Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 113 +++++++++++++++++++++++++--------------------------- kernel/sched_fair.c | 12 +++--- 2 files changed, 60 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7d282c52bd4..160d3c209b8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1480,12 +1480,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); */ static inline struct aggregate_struct * -aggregate(struct task_group *tg, struct sched_domain *sd) +aggregate(struct task_group *tg, int cpu) { - return &tg->cfs_rq[sd->first_cpu]->aggregate; + return &tg->cfs_rq[cpu]->aggregate; } -typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); +typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *); /* * Iterate the full tree, calling @down when first entering a node and @up when @@ -1493,14 +1493,14 @@ typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); */ static void aggregate_walk_tree(aggregate_func down, aggregate_func up, - struct sched_domain *sd) + int cpu, struct sched_domain *sd) { struct task_group *parent, *child; rcu_read_lock(); parent = &root_task_group; down: - (*down)(parent, sd); + (*down)(parent, cpu, sd); list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1508,7 +1508,7 @@ down: up: continue; } - (*up)(parent, sd); + (*up)(parent, cpu, sd); child = parent; parent = parent->parent; @@ -1520,8 +1520,8 @@ up: /* * Calculate the aggregate runqueue weight. 
*/ -static -void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long rq_weight = 0; unsigned long task_weight = 0; @@ -1532,15 +1532,15 @@ void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) task_weight += tg->cfs_rq[i]->task_weight; } - aggregate(tg, sd)->rq_weight = rq_weight; - aggregate(tg, sd)->task_weight = task_weight; + aggregate(tg, cpu)->rq_weight = rq_weight; + aggregate(tg, cpu)->task_weight = task_weight; } /* * Compute the weight of this group on the given cpus. */ -static -void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long shares = 0; int i; @@ -1548,18 +1548,18 @@ void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) for_each_cpu_mask(i, sd->span) shares += tg->cfs_rq[i]->shares; - if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares) + if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares) shares = tg->shares; - aggregate(tg, sd)->shares = shares; + aggregate(tg, cpu)->shares = shares; } /* * Compute the load fraction assigned to this group, relies on the aggregate * weight and this group's parent's load, i.e. top-down. */ -static -void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long load; @@ -1571,17 +1571,17 @@ void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) load += cpu_rq(i)->load.weight; } else { - load = aggregate(tg->parent, sd)->load; + load = aggregate(tg->parent, cpu)->load; /* * shares is our weight in the parent's rq so * shares/parent->rq_weight gives our fraction of the load */ - load *= aggregate(tg, sd)->shares; - load /= aggregate(tg->parent, sd)->rq_weight + 1; + load *= aggregate(tg, cpu)->shares; + load /= aggregate(tg->parent, cpu)->rq_weight + 1; } - aggregate(tg, sd)->load = load; + aggregate(tg, cpu)->load = load; } static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -1590,8 +1590,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); * Calculate and set the cpu's group shares. */ static void -__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, - int tcpu) +__update_group_shares_cpu(struct task_group *tg, int cpu, + struct sched_domain *sd, int tcpu) { int boost = 0; unsigned long shares; @@ -1618,8 +1618,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, * \Sum rq_weight * */ - shares = aggregate(tg, sd)->shares * rq_weight; - shares /= aggregate(tg, sd)->rq_weight + 1; + shares = aggregate(tg, cpu)->shares * rq_weight; + shares /= aggregate(tg, cpu)->rq_weight + 1; /* * record the actual number of shares, not the boosted amount. @@ -1639,15 +1639,15 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, * task went to. 
*/ static void -__move_group_shares(struct task_group *tg, struct sched_domain *sd, +__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, int scpu, int dcpu) { unsigned long shares; shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; - __update_group_shares_cpu(tg, sd, scpu); - __update_group_shares_cpu(tg, sd, dcpu); + __update_group_shares_cpu(tg, cpu, sd, scpu); + __update_group_shares_cpu(tg, cpu, sd, dcpu); /* * ensure we never loose shares due to rounding errors in the @@ -1663,19 +1663,19 @@ __move_group_shares(struct task_group *tg, struct sched_domain *sd, * we need to walk up the tree and change all shares until we hit the root. */ static void -move_group_shares(struct task_group *tg, struct sched_domain *sd, +move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, int scpu, int dcpu) { while (tg) { - __move_group_shares(tg, sd, scpu, dcpu); + __move_group_shares(tg, cpu, sd, scpu, dcpu); tg = tg->parent; } } -static -void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd) { - unsigned long shares = aggregate(tg, sd)->shares; + unsigned long shares = aggregate(tg, cpu)->shares; int i; for_each_cpu_mask(i, sd->span) { @@ -1683,20 +1683,20 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - __update_group_shares_cpu(tg, sd, i); + __update_group_shares_cpu(tg, cpu, sd, i); spin_unlock_irqrestore(&rq->lock, flags); } - aggregate_group_shares(tg, sd); + aggregate_group_shares(tg, cpu, sd); /* * ensure we never loose shares due to rounding errors in the * above redistribution. */ - shares -= aggregate(tg, sd)->shares; + shares -= aggregate(tg, cpu)->shares; if (shares) { - tg->cfs_rq[sd->first_cpu]->shares += shares; - aggregate(tg, sd)->shares += shares; + tg->cfs_rq[cpu]->shares += shares; + aggregate(tg, cpu)->shares += shares; } } @@ -1704,21 +1704,21 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) * Calculate the accumulative weight and recursive load of each task group * while walking down the tree. */ -static -void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd) { - aggregate_group_weight(tg, sd); - aggregate_group_shares(tg, sd); - aggregate_group_load(tg, sd); + aggregate_group_weight(tg, cpu, sd); + aggregate_group_shares(tg, cpu, sd); + aggregate_group_load(tg, cpu, sd); } /* * Rebalance the cpu shares while walking back up the tree. 
*/ -static -void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd) { - aggregate_group_set_shares(tg, sd); + aggregate_group_set_shares(tg, cpu, sd); } static DEFINE_PER_CPU(spinlock_t, aggregate_lock); @@ -1731,18 +1731,18 @@ static void __init init_aggregate(void) spin_lock_init(&per_cpu(aggregate_lock, i)); } -static int get_aggregate(struct sched_domain *sd) +static int get_aggregate(int cpu, struct sched_domain *sd) { - if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) + if (!spin_trylock(&per_cpu(aggregate_lock, cpu))) return 0; - aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); + aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd); return 1; } -static void put_aggregate(struct sched_domain *sd) +static void put_aggregate(int cpu, struct sched_domain *sd) { - spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); + spin_unlock(&per_cpu(aggregate_lock, cpu)); } static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) @@ -1756,12 +1756,12 @@ static inline void init_aggregate(void) { } -static inline int get_aggregate(struct sched_domain *sd) +static inline int get_aggregate(int cpu, struct sched_domain *sd) { return 0; } -static inline void put_aggregate(struct sched_domain *sd) +static inline void put_aggregate(int cpu, struct sched_domain *sd) { } #endif @@ -3539,7 +3539,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpus_setall(*cpus); - unlock_aggregate = get_aggregate(sd); + unlock_aggregate = get_aggregate(this_cpu, sd); /* * When power savings policy is enabled for the parent domain, idle @@ -3678,7 +3678,7 @@ out_one_pinned: ld_moved = 0; out: if (unlock_aggregate) - put_aggregate(sd); + put_aggregate(this_cpu, sd); return ld_moved; } @@ -7292,7 +7292,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); sd->span = *cpu_map; - sd->first_cpu = first_cpu(sd->span); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@ -7303,7 +7302,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, NODE); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), &sd->span); - sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7315,7 +7313,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, CPU); set_domain_attribute(sd, attr); sd->span = *nodemask; - sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7327,7 +7324,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, MC); set_domain_attribute(sd, attr); sd->span = cpu_coregroup_map(i); - sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7340,7 +7336,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); sd->span = per_cpu(cpu_sibling_map, i); - sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 509092af033..40cf24ab4de 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1429,11 +1429,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, /* * empty group */ - if (!aggregate(tg, sd)->task_weight) + if (!aggregate(tg, this_cpu)->task_weight) continue; - rem_load = 
rem_load_move * aggregate(tg, sd)->rq_weight; - rem_load /= aggregate(tg, sd)->load + 1; + rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight; + rem_load /= aggregate(tg, this_cpu)->load + 1; this_weight = tg->cfs_rq[this_cpu]->task_weight; busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; @@ -1451,10 +1451,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, if (!moved_load) continue; - move_group_shares(tg, sd, busiest_cpu, this_cpu); + move_group_shares(tg, this_cpu, sd, busiest_cpu, this_cpu); - moved_load *= aggregate(tg, sd)->load; - moved_load /= aggregate(tg, sd)->rq_weight + 1; + moved_load *= aggregate(tg, this_cpu)->load; + moved_load /= aggregate(tg, this_cpu)->rq_weight + 1; rem_load_move -= moved_load; if (rem_load_move < 0) -- cgit v1.2.3 From 4d8d595dfa69e1c807bf928f364668a7f30da5dc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:19 +0200 Subject: sched: update aggregate when holding the RQs It was observed that in __update_group_shares_cpu() rq_weight > aggregate()->rq_weight This is caused by forks/wakeups in between the initial aggregate pass and locking of the RQs for load balance. To avoid this situation partially re-do the aggregation once we have the RQs locked (which avoids new tasks from appearing). Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 160d3c209b8..dae20199dc9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1721,6 +1721,11 @@ aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd) aggregate_group_set_shares(tg, cpu, sd); } +static void +aggregate_get_nop(struct task_group *tg, int cpu, struct sched_domain *sd) +{ +} + static DEFINE_PER_CPU(spinlock_t, aggregate_lock); static void __init init_aggregate(void) @@ -1740,6 +1745,11 @@ static int get_aggregate(int cpu, struct sched_domain *sd) return 1; } +static void update_aggregate(int cpu, struct sched_domain *sd) +{ + aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd); +} + static void put_aggregate(int cpu, struct sched_domain *sd) { spin_unlock(&per_cpu(aggregate_lock, cpu)); @@ -1761,6 +1771,10 @@ static inline int get_aggregate(int cpu, struct sched_domain *sd) return 0; } +static inline void update_aggregate(int cpu, struct sched_domain *sd) +{ +} + static inline void put_aggregate(int cpu, struct sched_domain *sd) { } @@ -2192,6 +2206,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; + /* + * now that we have both rqs locked the rq weight won't change + * anymore - so update the stats. + */ + update_aggregate(this_cpu, sd); + do { unsigned long load, avg_load; int local_group; -- cgit v1.2.3 From 53fecd8ae1900fb571086f54f664051004665b55 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Fri, 27 Jun 2008 13:41:20 +0200 Subject: sched: kill task_group balancing The idea was to balance groups until we've reached the global goal, however Vatsa rightly pointed out that we might never reach that goal this way - hence take out this logic. 
[ the initial rationale for this 'feature' was to promote max concurrency within a group - it does not however affect fairness ] Reported-by: Srivatsa Vaddagiri Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 40cf24ab4de..b10c0d61a2a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1422,9 +1422,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rcu_read_lock(); list_for_each_entry(tg, &task_groups, list) { - long imbalance; - unsigned long this_weight, busiest_weight; - long rem_load, max_load, moved_load; + long rem_load, moved_load; /* * empty group @@ -1435,17 +1433,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight; rem_load /= aggregate(tg, this_cpu)->load + 1; - this_weight = tg->cfs_rq[this_cpu]->task_weight; - busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; - - imbalance = (busiest_weight - this_weight) / 2; - - if (imbalance < 0) - imbalance = busiest_weight; - - max_load = max(rem_load, imbalance); moved_load = __load_balance_fair(this_rq, this_cpu, busiest, - max_load, sd, idle, all_pinned, this_best_prio, + rem_load, sd, idle, all_pinned, this_best_prio, tg->cfs_rq[busiest_cpu]); if (!moved_load) -- cgit v1.2.3 From d3f40dbab954d83383b6a516582d5c09cc216dcc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:21 +0200 Subject: sched: dont micro manage share losses We used to try and contain the loss of 'shares' by playing arithmetic games. Replace that by noticing that at the top sched_domain we'll always have the full weight in shares to distribute. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index dae20199dc9..28229c5d498 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1551,6 +1551,9 @@ aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd) if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares) shares = tg->shares; + if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) + shares = tg->shares; + aggregate(tg, cpu)->shares = shares; } @@ -1642,20 +1645,8 @@ static void __move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, int scpu, int dcpu) { - unsigned long shares; - - shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; - __update_group_shares_cpu(tg, cpu, sd, scpu); __update_group_shares_cpu(tg, cpu, sd, dcpu); - - /* - * ensure we never loose shares due to rounding errors in the - * above redistribution. 
- */ - shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; - if (shares) - tg->cfs_rq[dcpu]->shares += shares; } /* @@ -1675,7 +1666,6 @@ move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, static void aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd) { - unsigned long shares = aggregate(tg, cpu)->shares; int i; for_each_cpu_mask(i, sd->span) { @@ -1688,16 +1678,6 @@ aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain * } aggregate_group_shares(tg, cpu, sd); - - /* - * ensure we never loose shares due to rounding errors in the - * above redistribution. - */ - shares -= aggregate(tg, cpu)->shares; - if (shares) { - tg->cfs_rq[cpu]->shares += shares; - aggregate(tg, cpu)->shares += shares; - } } /* -- cgit v1.2.3 From a25b5aca8740ea99d5e18dfc71235a52b685dcf7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:22 +0200 Subject: sched: no need to aggregate task_weight We only need to know the task_weight of the busiest rq - nothing to do if there are no tasks there. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 +--------------- kernel/sched_fair.c | 2 +- 2 files changed, 2 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 28229c5d498..716cfc8e099 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -433,12 +433,6 @@ struct cfs_rq { * The sum of all runqueue weights within this span. */ unsigned long rq_weight; - - /* - * Weight contributed by tasks; this is the part we can - * influence by moving tasks around. - */ - unsigned long task_weight; } aggregate; #endif #endif @@ -1473,10 +1467,6 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); * rq_weight: * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while * B would get 2. - * - * task_weight: - * Part of the rq_weight contributed by tasks; all groups except B would - * get 1, B gets 2. */ static inline struct aggregate_struct * @@ -1524,16 +1514,12 @@ static void aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long rq_weight = 0; - unsigned long task_weight = 0; int i; - for_each_cpu_mask(i, sd->span) { + for_each_cpu_mask(i, sd->span) rq_weight += tg->cfs_rq[i]->load.weight; - task_weight += tg->cfs_rq[i]->task_weight; - } aggregate(tg, cpu)->rq_weight = rq_weight; - aggregate(tg, cpu)->task_weight = task_weight; } /* diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b10c0d61a2a..03b9fbd9d64 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1427,7 +1427,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, /* * empty group */ - if (!aggregate(tg, this_cpu)->task_weight) + if (!tg->cfs_rq[busiest_cpu]->task_weight) continue; rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight; -- cgit v1.2.3 From c8cba857b4997d5b00451d01474638f6a153f713 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:23 +0200 Subject: sched: simplify the group load balancer While thinking about the previous patch - I realized that using per domain aggregate load values in load_balance_fair() is wrong. We should use the load value for that CPU. By not needing per domain hierarchical load values we don't need to store per domain aggregate shares, which greatly simplifies all the math. 
It basically falls apart in two separate computations: - per domain update of the shares - per CPU update of the hierarchical load Also get rid of the move_group_shares() stuff - just re-compute the shares again after a successful load balance. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 286 ++++++++++++---------------------------------------- kernel/sched_fair.c | 15 +-- 2 files changed, 72 insertions(+), 229 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 716cfc8e099..f864b751fd1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -406,34 +406,23 @@ struct cfs_rq { struct task_group *tg; /* group that "owns" this runqueue */ #ifdef CONFIG_SMP - unsigned long task_weight; - unsigned long shares; /* - * We need space to build a sched_domain wide view of the full task - * group tree, in order to avoid depending on dynamic memory allocation - * during the load balancing we place this in the per cpu task group - * hierarchy. This limits the load balancing to one instance per cpu, - * but more should not be needed anyway. + * the part of load.weight contributed by tasks */ - struct aggregate_struct { - /* - * load = weight(cpus) * f(tg) - * - * Where f(tg) is the recursive weight fraction assigned to - * this group. - */ - unsigned long load; + unsigned long task_weight; - /* - * part of the group weight distributed to this span. - */ - unsigned long shares; + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long h_load; - /* - * The sum of all runqueue weights within this span. - */ - unsigned long rq_weight; - } aggregate; + /* + * this cpu's part of tg->shares + */ + unsigned long shares; #endif #endif }; @@ -1443,47 +1432,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); #ifdef CONFIG_FAIR_GROUP_SCHED -/* - * Group load balancing. - * - * We calculate a few balance domain wide aggregate numbers; load and weight. - * Given the pictures below, and assuming each item has equal weight: - * - * root 1 - thread - * / | \ A - group - * A 1 B - * /|\ / \ - * C 2 D 3 4 - * | | - * 5 6 - * - * load: - * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd, - * which equals 1/9-th of the total load. - * - * shares: - * The weight of this group on the selected cpus. - * - * rq_weight: - * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while - * B would get 2. - */ - -static inline struct aggregate_struct * -aggregate(struct task_group *tg, int cpu) -{ - return &tg->cfs_rq[cpu]->aggregate; -} - -typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *); +typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); /* * Iterate the full tree, calling @down when first entering a node and @up when * leaving it for the final time. */ -static -void aggregate_walk_tree(aggregate_func down, aggregate_func up, - int cpu, struct sched_domain *sd) +static void +walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) { struct task_group *parent, *child; @@ -1507,72 +1463,6 @@ up: rcu_read_unlock(); } -/* - * Calculate the aggregate runqueue weight. 
- */ -static void -aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd) -{ - unsigned long rq_weight = 0; - int i; - - for_each_cpu_mask(i, sd->span) - rq_weight += tg->cfs_rq[i]->load.weight; - - aggregate(tg, cpu)->rq_weight = rq_weight; -} - -/* - * Compute the weight of this group on the given cpus. - */ -static void -aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd) -{ - unsigned long shares = 0; - int i; - - for_each_cpu_mask(i, sd->span) - shares += tg->cfs_rq[i]->shares; - - if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares) - shares = tg->shares; - - if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) - shares = tg->shares; - - aggregate(tg, cpu)->shares = shares; -} - -/* - * Compute the load fraction assigned to this group, relies on the aggregate - * weight and this group's parent's load, i.e. top-down. - */ -static void -aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd) -{ - unsigned long load; - - if (!tg->parent) { - int i; - - load = 0; - for_each_cpu_mask(i, sd->span) - load += cpu_rq(i)->load.weight; - - } else { - load = aggregate(tg->parent, cpu)->load; - - /* - * shares is our weight in the parent's rq so - * shares/parent->rq_weight gives our fraction of the load - */ - load *= aggregate(tg, cpu)->shares; - load /= aggregate(tg->parent, cpu)->rq_weight + 1; - } - - aggregate(tg, cpu)->load = load; -} - static void __set_se_shares(struct sched_entity *se, unsigned long shares); /* @@ -1580,16 +1470,16 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); */ static void __update_group_shares_cpu(struct task_group *tg, int cpu, - struct sched_domain *sd, int tcpu) + unsigned long sd_shares, unsigned long sd_rq_weight) { int boost = 0; unsigned long shares; unsigned long rq_weight; - if (!tg->se[tcpu]) + if (!tg->se[cpu]) return; - rq_weight = tg->cfs_rq[tcpu]->load.weight; + rq_weight = tg->cfs_rq[cpu]->load.weight; /* * If there are currently no tasks on the cpu pretend there is one of @@ -1601,124 +1491,97 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, rq_weight = NICE_0_LOAD; } + if (unlikely(rq_weight > sd_rq_weight)) + rq_weight = sd_rq_weight; + /* * \Sum shares * rq_weight * shares = ----------------------- * \Sum rq_weight * */ - shares = aggregate(tg, cpu)->shares * rq_weight; - shares /= aggregate(tg, cpu)->rq_weight + 1; + shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); /* * record the actual number of shares, not the boosted amount. */ - tg->cfs_rq[tcpu]->shares = boost ? 0 : shares; + tg->cfs_rq[cpu]->shares = boost ? 0 : shares; if (shares < MIN_SHARES) shares = MIN_SHARES; else if (shares > MAX_SHARES) shares = MAX_SHARES; - __set_se_shares(tg->se[tcpu], shares); + __set_se_shares(tg->se[cpu], shares); } /* - * Re-adjust the weights on the cpu the task came from and on the cpu the - * task went to. + * Re-compute the task group their per cpu shares over the given domain. + * This needs to be done in a bottom-up fashion because the rq weight of a + * parent group depends on the shares of its child groups. 
*/ static void -__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, - int scpu, int dcpu) +tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) { - __update_group_shares_cpu(tg, cpu, sd, scpu); - __update_group_shares_cpu(tg, cpu, sd, dcpu); -} + unsigned long rq_weight = 0; + unsigned long shares = 0; + int i; -/* - * Because changing a group's shares changes the weight of the super-group - * we need to walk up the tree and change all shares until we hit the root. - */ -static void -move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, - int scpu, int dcpu) -{ - while (tg) { - __move_group_shares(tg, cpu, sd, scpu, dcpu); - tg = tg->parent; + for_each_cpu_mask(i, sd->span) { + rq_weight += tg->cfs_rq[i]->load.weight; + shares += tg->cfs_rq[i]->shares; } -} -static void -aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd) -{ - int i; + if ((!shares && rq_weight) || shares > tg->shares) + shares = tg->shares; + + if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) + shares = tg->shares; for_each_cpu_mask(i, sd->span) { struct rq *rq = cpu_rq(i); unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - __update_group_shares_cpu(tg, cpu, sd, i); + __update_group_shares_cpu(tg, i, shares, rq_weight); spin_unlock_irqrestore(&rq->lock, flags); } - - aggregate_group_shares(tg, cpu, sd); } /* - * Calculate the accumulative weight and recursive load of each task group - * while walking down the tree. + * Compute the cpu's hierarchical load factor for each task group. + * This needs to be done in a top-down fashion because the load of a child + * group is a fraction of its parents load. */ static void -aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd) +tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) { - aggregate_group_weight(tg, cpu, sd); - aggregate_group_shares(tg, cpu, sd); - aggregate_group_load(tg, cpu, sd); -} - -/* - * Rebalance the cpu shares while walking back up the tree. 
- */ -static void -aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd) -{ - aggregate_group_set_shares(tg, cpu, sd); -} - -static void -aggregate_get_nop(struct task_group *tg, int cpu, struct sched_domain *sd) -{ -} - -static DEFINE_PER_CPU(spinlock_t, aggregate_lock); + unsigned long load; -static void __init init_aggregate(void) -{ - int i; + if (!tg->parent) { + load = cpu_rq(cpu)->load.weight; + } else { + load = tg->parent->cfs_rq[cpu]->h_load; + load *= tg->cfs_rq[cpu]->shares; + load /= tg->parent->cfs_rq[cpu]->load.weight + 1; + } - for_each_possible_cpu(i) - spin_lock_init(&per_cpu(aggregate_lock, i)); + tg->cfs_rq[cpu]->h_load = load; } -static int get_aggregate(int cpu, struct sched_domain *sd) +static void +tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) { - if (!spin_trylock(&per_cpu(aggregate_lock, cpu))) - return 0; - - aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd); - return 1; } -static void update_aggregate(int cpu, struct sched_domain *sd) +static void update_shares(struct sched_domain *sd) { - aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd); + walk_tg_tree(tg_nop, tg_shares_up, 0, sd); } -static void put_aggregate(int cpu, struct sched_domain *sd) +static void update_h_load(int cpu) { - spin_unlock(&per_cpu(aggregate_lock, cpu)); + walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); } static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) @@ -1728,22 +1591,10 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) #else -static inline void init_aggregate(void) -{ -} - -static inline int get_aggregate(int cpu, struct sched_domain *sd) -{ - return 0; -} - -static inline void update_aggregate(int cpu, struct sched_domain *sd) +static inline void update_shares(struct sched_domain *sd) { } -static inline void put_aggregate(int cpu, struct sched_domain *sd) -{ -} #endif #endif @@ -2172,12 +2023,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; - /* - * now that we have both rqs locked the rq weight won't change - * anymore - so update the stats. - */ - update_aggregate(this_cpu, sd); - do { unsigned long load, avg_load; int local_group; @@ -3521,12 +3366,9 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long imbalance; struct rq *busiest; unsigned long flags; - int unlock_aggregate; cpus_setall(*cpus); - unlock_aggregate = get_aggregate(this_cpu, sd); - /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. 
In this case, @@ -3540,6 +3382,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, schedstat_inc(sd, lb_count[idle]); redo: + update_shares(sd); group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, cpus, balance); @@ -3663,8 +3506,8 @@ out_one_pinned: else ld_moved = 0; out: - if (unlock_aggregate) - put_aggregate(this_cpu, sd); + if (ld_moved) + update_shares(sd); return ld_moved; } @@ -8019,7 +7862,6 @@ void __init sched_init(void) } #ifdef CONFIG_SMP - init_aggregate(); init_defrootdomain(); #endif diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 03b9fbd9d64..7b8d664d6f2 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1421,17 +1421,20 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, struct task_group *tg; rcu_read_lock(); + update_h_load(busiest_cpu); + list_for_each_entry(tg, &task_groups, list) { + struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; long rem_load, moved_load; /* * empty group */ - if (!tg->cfs_rq[busiest_cpu]->task_weight) + if (!busiest_cfs_rq->task_weight) continue; - rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight; - rem_load /= aggregate(tg, this_cpu)->load + 1; + rem_load = rem_load_move * busiest_cfs_rq->load.weight; + rem_load /= busiest_cfs_rq->h_load + 1; moved_load = __load_balance_fair(this_rq, this_cpu, busiest, rem_load, sd, idle, all_pinned, this_best_prio, @@ -1440,10 +1443,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, if (!moved_load) continue; - move_group_shares(tg, this_cpu, sd, busiest_cpu, this_cpu); - - moved_load *= aggregate(tg, this_cpu)->load; - moved_load /= aggregate(tg, this_cpu)->rq_weight + 1; + moved_load *= busiest_cfs_rq->h_load; + moved_load /= busiest_cfs_rq->load.weight + 1; rem_load_move -= moved_load; if (rem_load_move < 0) -- cgit v1.2.3 From 3e5459b4bea3ca2618cc02d56d12639f2cba531d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:24 +0200 Subject: sched: fix newidle smp group balancing Re-compute the shares on newidle - so we can make a decision based on recent data. 
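A condensed, annotated view of the new helper (this mirrors the hunk below; it is not an additional API): tg_shares_up() acquires each CPU's rq->lock in turn while redistributing shares, so the caller's runqueue lock is dropped around the update rather than nesting runqueue locks:

	static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
	{
		/* tg_shares_up() locks other CPUs' runqueues one by one,
		 * so release ours for the duration of the walk */
		spin_unlock(&rq->lock);
		update_shares(sd);
		spin_lock(&rq->lock);
	}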
Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f864b751fd1..cdd09462fc9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1579,6 +1579,13 @@ static void update_shares(struct sched_domain *sd) walk_tg_tree(tg_nop, tg_shares_up, 0, sd); } +static void update_shares_locked(struct rq *rq, struct sched_domain *sd) +{ + spin_unlock(&rq->lock); + update_shares(sd); + spin_lock(&rq->lock); +} + static void update_h_load(int cpu) { walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); @@ -1595,6 +1602,10 @@ static inline void update_shares(struct sched_domain *sd) { } +static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) +{ +} + #endif #endif @@ -3543,6 +3554,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); redo: + update_shares_locked(this_rq, sd); group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, &sd_idle, cpus, NULL); if (!group) { @@ -3586,6 +3598,7 @@ redo: } else sd->nr_balance_failed = 0; + update_shares_locked(this_rq, sd); return ld_moved; out_balanced: -- cgit v1.2.3 From 039a1c41b3a489e34593ea1e1687f6fdad6b13ab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:25 +0200 Subject: sched: fix sched_balance_self() smp group balancing Finding the least idle cpu is more accurate when done with updated shares. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index cdd09462fc9..39d5495540d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2128,6 +2128,9 @@ static int sched_balance_self(int cpu, int flag) sd = tmp; } + if (sd) + update_shares(sd); + while (sd) { cpumask_t span, tmpmask; struct sched_group *group; -- cgit v1.2.3 From a8a51d5e59561aa5b4d66e19eca819b537783e8f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:26 +0200 Subject: sched: persistent average load per task Remove the fall-back to SCHED_LOAD_SCALE by remembering the previous value of cpu_avg_load_per_task() - this is useful because of the hierarchical group model in which task weight can be much smaller. 
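A simplified user-space sketch of the caching behaviour the new helper (in the hunk below) introduces; struct and field names here are illustrative, not the kernel's:

	struct rq_snapshot {
		unsigned long load_weight;	/* sum of runnable task weights */
		unsigned long nr_running;
		unsigned long avg_per_task;	/* remembered across idle periods */
	};

	static unsigned long avg_load_per_task(struct rq_snapshot *rq)
	{
		/* recompute only while something is runnable ... */
		if (rq->nr_running)
			rq->avg_per_task = rq->load_weight / rq->nr_running;

		/* ... otherwise keep returning the last measured average
		 * instead of a fixed SCHED_LOAD_SCALE fallback */
		return rq->avg_per_task;
	}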
Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 39d5495540d..6a6b0139eb3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -554,6 +554,8 @@ struct rq { int cpu; int online; + unsigned long avg_load_per_task; + struct task_struct *migration_thread; struct list_head migration_queue; #endif @@ -1427,9 +1429,18 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) #ifdef CONFIG_SMP static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); -static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->nr_running) + rq->avg_load_per_task = rq->load.weight / rq->nr_running; + + return rq->avg_load_per_task; +} + #ifdef CONFIG_FAIR_GROUP_SCHED typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); @@ -2010,18 +2021,6 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -/* - * Return the average load per task on the cpu's run queue - */ -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - unsigned long n = rq->nr_running; - - return n ? total / n : SCHED_LOAD_SCALE; -} - /* * find_idlest_group finds and returns the least busy CPU group within the * domain. -- cgit v1.2.3 From bb3469ac9b50f14ad6eba129ca0ad4fd033097a0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:27 +0200 Subject: sched: hierarchical load vs affine wakeups With hierarchical grouping we can't just compare task weight to rq weight - we need to scale the weight appropriately. 
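The helper added below scales a task's raw weight by its runqueue's hierarchical load, roughly weight * h_load / rq_weight (the patch itself uses calc_delta_mine() for the overflow-safe division). A rough stand-alone version of that scaling, with illustrative names and plain 64-bit arithmetic:

	/*
	 * Hierarchical weight of one task: scale its weight by the share of
	 * root-level load its runqueue represents (h_load / rq_weight).
	 */
	static unsigned long task_h_weight(unsigned long task_weight,
					   unsigned long cfs_h_load,
					   unsigned long cfs_rq_weight)
	{
		unsigned long long tmp;

		if (!cfs_rq_weight)	/* empty queue: nothing to scale against */
			return task_weight;

		tmp = (unsigned long long)task_weight * cfs_h_load;
		return (unsigned long)(tmp / cfs_rq_weight);
	}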
Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7b8d664d6f2..865cb53a7cc 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1073,6 +1073,25 @@ static inline int wake_idle(int cpu, struct task_struct *p) static const struct sched_class fair_sched_class; +#ifdef CONFIG_FAIR_GROUP_SCHED +static unsigned long task_h_load(struct task_struct *p) +{ + unsigned long h_load = p->se.load.weight; + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + + update_h_load(task_cpu(p)); + + h_load = calc_delta_mine(h_load, cfs_rq->h_load, &cfs_rq->load); + + return h_load; +} +#else +static unsigned long task_h_load(struct task_struct *p) +{ + return p->se.load.weight; +} +#endif + static int wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, struct task_struct *p, int prev_cpu, int this_cpu, int sync, @@ -1093,9 +1112,9 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, * of the current CPU: */ if (sync) - tl -= current->se.load.weight; + tl -= task_h_load(current); - balanced = 100*(tl + p->se.load.weight) <= imbalance*load; + balanced = 100*(tl + task_h_load(p)) <= imbalance*load; /* * If the currently running task will sleep within -- cgit v1.2.3 From 408ed066b11cf9ee4536573b4269ee3613bd735e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:28 +0200 Subject: sched: hierarchical load vs find_busiest_group find_busiest_group() has some assumptions about task weight being in the NICE_0_LOAD range. Hierarchical task groups break this assumption - fix this by replacing it with the average task weight, which will adapt the situation. 
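What the new test boils down to: a group counts as internally imbalanced when the load spread between its busiest and idlest CPU is worth more than about two average tasks, instead of a fixed SCHED_LOAD_SCALE. A simplified sketch of that check (straight averaging, assuming nr_cpus > 0; the kernel additionally scales by the group's cpu_power):

	static int group_is_imbalanced(const unsigned long *cpu_load,
				       const unsigned long *avg_task_load,
				       int nr_cpus)
	{
		unsigned long max_load = 0, min_load = ~0UL, sum_avg = 0;
		int i;

		for (i = 0; i < nr_cpus; i++) {
			if (cpu_load[i] > max_load)
				max_load = cpu_load[i];
			if (cpu_load[i] < min_load)
				min_load = cpu_load[i];
			sum_avg += avg_task_load[i];
		}

		/* imbalanced if the spread exceeds ~2 average task weights */
		return (max_load - min_load) > 2 * (sum_avg / nr_cpus);
	}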
Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6a6b0139eb3..5e2aa394a81 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3050,6 +3050,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_load = this_load = total_load = total_pwr = 0; busiest_load_per_task = busiest_nr_running = 0; this_load_per_task = this_nr_running = 0; + if (idle == CPU_NOT_IDLE) load_idx = sd->busy_idx; else if (idle == CPU_NEWLY_IDLE) @@ -3064,6 +3065,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, int __group_imb = 0; unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long sum_nr_running, sum_weighted_load; + unsigned long sum_avg_load_per_task; + unsigned long avg_load_per_task; local_group = cpu_isset(this_cpu, group->cpumask); @@ -3072,6 +3075,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; + sum_avg_load_per_task = avg_load_per_task = 0; + max_cpu_load = 0; min_cpu_load = ~0UL; @@ -3105,6 +3110,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, avg_load += load; sum_nr_running += rq->nr_running; sum_weighted_load += weighted_cpuload(i); + + sum_avg_load_per_task += cpu_avg_load_per_task(i); } /* @@ -3126,7 +3133,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, avg_load = sg_div_cpu_power(group, avg_load * SCHED_LOAD_SCALE); - if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) + + /* + * Consider the group unbalanced when the imbalance is larger + * than the average weight of two tasks. + * + * APZ: with cgroup the avg task weight can vary wildly and + * might not be a suitable number - should we keep a + * normalized nr_running number somewhere that negates + * the hierarchy? + */ + avg_load_per_task = sg_div_cpu_power(group, + sum_avg_load_per_task * SCHED_LOAD_SCALE); + + if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) __group_imb = 1; group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; @@ -3267,9 +3287,9 @@ small_imbalance: if (busiest_load_per_task > this_load_per_task) imbn = 1; } else - this_load_per_task = SCHED_LOAD_SCALE; + this_load_per_task = cpu_avg_load_per_task(this_cpu); - if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= + if (max_load - this_load + 2*busiest_load_per_task >= busiest_load_per_task * imbn) { *imbalance = busiest_load_per_task; return busiest; -- cgit v1.2.3 From 42a3ac7d5cee89849448b41b86faeb86f98e92f6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:29 +0200 Subject: sched: fix load scaling in group balancing doing the load balance will change cfs_rq->load.weight (that's the whole point) but since that's part of the scale factor, we'll scale back with a different amount. Weight getting smaller would result in an inflated moved_load which causes it to stop balancing too soon. 
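A made-up numeric example of the inflation: with h_load = 10 and load.weight = 100 before the pull, a request of 50 becomes rem_load = 50 * 100 / (10 + 1) ~= 454 in cfs_rq units. Scaled back with the original weight that is 454 * 10 / (100 + 1) ~= 45, as expected; but if the pull shrank load.weight to 60 and the new value is used, it becomes 454 * 10 / (60 + 1) ~= 74, so rem_load_move drains faster than load was actually moved and balancing stops early. Hence the hunk below snapshots both factors into locals before __load_balance_fair() runs.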
Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 865cb53a7cc..734e4c556fc 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1444,6 +1444,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, list_for_each_entry(tg, &task_groups, list) { struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; + unsigned long busiest_h_load = busiest_cfs_rq->h_load; + unsigned long busiest_weight = busiest_cfs_rq->load.weight; long rem_load, moved_load; /* @@ -1452,8 +1454,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, if (!busiest_cfs_rq->task_weight) continue; - rem_load = rem_load_move * busiest_cfs_rq->load.weight; - rem_load /= busiest_cfs_rq->h_load + 1; + rem_load = rem_load_move * busiest_weight; + rem_load /= busiest_h_load + 1; moved_load = __load_balance_fair(this_rq, this_cpu, busiest, rem_load, sd, idle, all_pinned, this_best_prio, @@ -1462,8 +1464,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, if (!moved_load) continue; - moved_load *= busiest_cfs_rq->h_load; - moved_load /= busiest_cfs_rq->load.weight + 1; + moved_load *= busiest_h_load; + moved_load /= busiest_weight + 1; rem_load_move -= moved_load; if (rem_load_move < 0) -- cgit v1.2.3 From 4be9daaa1b33701f011f4117f22dc1e45a3e6e34 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:30 +0200 Subject: sched: fix task_h_load() Currently task_h_load() computes the load of a task and uses that to either subtract it from the total, or add to it. However, removing or adding a task need not have any effect on the total load at all. Imagine adding a task to a group that is local to one cpu - in that case the total load of that cpu is unaffected. So properly compute addition/removal: s_i = S * rw_i / \Sum_j rw_j s'_i = S * (rw_i + wl) / (\Sum_j rw_j + wg) then s'_i - s_i gives the change in load. Where s_i is the shares for cpu i, S the group weight, rw_i the runqueue weight for that cpu, wl the weight we add (subtract) and wg the weight contribution to the runqueue. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 49 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 734e4c556fc..a1694441f8b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1074,22 +1074,53 @@ static inline int wake_idle(int cpu, struct task_struct *p) static const struct sched_class fair_sched_class; #ifdef CONFIG_FAIR_GROUP_SCHED -static unsigned long task_h_load(struct task_struct *p) +static unsigned long effective_load(struct task_group *tg, long wl, int cpu) { - unsigned long h_load = p->se.load.weight; - struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + struct sched_entity *se = tg->se[cpu]; + long wg = wl; - update_h_load(task_cpu(p)); + for_each_sched_entity(se) { +#define D(n) (likely(n) ? 
(n) : 1) + + long S, Srw, rw, s, sn; + + S = se->my_q->tg->shares; + s = se->my_q->shares; + rw = se->my_q->load.weight; - h_load = calc_delta_mine(h_load, cfs_rq->h_load, &cfs_rq->load); + Srw = S * rw / D(s); + sn = S * (rw + wl) / D(Srw + wg); + + wl = sn - s; + wg = 0; +#undef D + } - return h_load; + return wl; } + +static unsigned long task_load_sub(struct task_struct *p) +{ + return effective_load(task_group(p), -(long)p->se.load.weight, task_cpu(p)); +} + +static unsigned long task_load_add(struct task_struct *p, int cpu) +{ + return effective_load(task_group(p), p->se.load.weight, cpu); +} + #else -static unsigned long task_h_load(struct task_struct *p) + +static unsigned long task_load_sub(struct task_struct *p) +{ + return -p->se.load.weight; +} + +static unsigned long task_load_add(struct task_struct *p, int cpu) { return p->se.load.weight; } + #endif static int @@ -1112,9 +1143,9 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, * of the current CPU: */ if (sync) - tl -= task_h_load(current); + tl += task_load_sub(current); - balanced = 100*(tl + task_h_load(p)) <= imbalance*load; + balanced = 100*(tl + task_load_add(p, this_cpu)) <= imbalance*load; /* * If the currently running task will sleep within -- cgit v1.2.3 From 051c67640e771fd6ad1b676fc0c16c379b3c6f80 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:31 +0200 Subject: sched: remove prio preference from balance decisions Priority looses much of its meaning in a hierarchical context. So don't use it in balance decisions. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5e2aa394a81..10d43f5bf0f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2896,7 +2896,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, struct rq_iterator *iterator) { - int loops = 0, pulled = 0, pinned = 0, skip_for_load; + int loops = 0, pulled = 0, pinned = 0; struct task_struct *p; long rem_load_move = max_load_move; @@ -2912,14 +2912,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, next: if (!p || loops++ > sysctl_sched_nr_migrate) goto out; - /* - * To help distribute high priority tasks across CPUs we don't - * skip a task if it will be the highest priority task (i.e. 
smallest - * prio value) on its new queue regardless of its load weight - */ - skip_for_load = (p->se.load.weight >> 1) > rem_load_move + - SCHED_LOAD_SCALE_FUZZ; - if ((skip_for_load && p->prio >= *this_best_prio) || + + if ((p->se.load.weight >> 1) > rem_load_move || !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { p = iterator->next(iterator->arg); goto next; -- cgit v1.2.3 From cb5ef42a03a13f95a9ea94e6cda4f7a47497871f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:32 +0200 Subject: sched: optimize effective_load() s_i = S * rw_i / \Sum_j rw_j -> \Sum_j rw_j = S * rw_i / s_i -> s'_i = S * (rw_i + w) / (\Sum_j rw_j + w) delta s = s' - s = S * (rw + w) / ((S * rw / s) + w) = s * (S * (rw + w) / (S * rw + s * w) - 1) a = S*(rw+w), b = S*rw + s*w delta s = s * (a-b) / b IOW, trade one divide for two multiplies Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a1694441f8b..0d197be3e3e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1082,16 +1082,16 @@ static unsigned long effective_load(struct task_group *tg, long wl, int cpu) for_each_sched_entity(se) { #define D(n) (likely(n) ? (n) : 1) - long S, Srw, rw, s, sn; + long S, rw, s, a, b; S = se->my_q->tg->shares; s = se->my_q->shares; rw = se->my_q->load.weight; - Srw = S * rw / D(s); - sn = S * (rw + wl) / D(Srw + wg); + a = S*(rw + wl); + b = S*rw + s*wg; - wl = sn - s; + wl = s*(a-b)/D(b); wg = 0; #undef D } -- cgit v1.2.3 From 93b75217df39e6d75889cc6f8050343286aff4a5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:33 +0200 Subject: sched: disable source/target_load bias The bias given by source/target_load functions can be very large, disable it by default to get faster convergence. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- kernel/sched_features.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 10d43f5bf0f..6c5eb3bc37e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2000,7 +2000,7 @@ static unsigned long source_load(int cpu, int type) struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu); - if (type == 0) + if (type == 0 || !sched_feat(LB_BIAS)) return total; return min(rq->cpu_load[type-1], total); @@ -2015,7 +2015,7 @@ static unsigned long target_load(int cpu, int type) struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu); - if (type == 0) + if (type == 0 || !sched_feat(LB_BIAS)) return total; return max(rq->cpu_load[type-1], total); diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 04123af2e67..d56e3053e74 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -8,3 +8,4 @@ SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(HRTICK, 1) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(ASYM_GRAN, 1) +SCHED_FEAT(LB_BIAS, 0) \ No newline at end of file -- cgit v1.2.3 From cd80917e4ff465ea77106f8e4fb631eedc4cf426 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:34 +0200 Subject: sched: fix shares boost logic In case the domain is empty, pretend there is a single task on each cpu, so that together with the boost logic we end up giving 1/n shares to each cpu. 
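Plugging the substitute weight into the per-cpu shares formula (shares = sd_shares * rq_weight / (sd_rq_weight + 1)) shows why this comes out as an even split: each idle cpu is boosted to a pretend rq_weight of NICE_0_LOAD and the domain-wide weight becomes n * NICE_0_LOAD, so with S as the shares being distributed over the domain

	shares_i = S * NICE_0_LOAD / (n * NICE_0_LOAD + 1)
	         ~= S / n

i.e. roughly 1/n of the shares per cpu, the +1 merely guarding against division by zero.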
Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6c5eb3bc37e..1cff969f664 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1549,6 +1549,9 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; + if (!rq_weight) + rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; + for_each_cpu_mask(i, sd->span) { struct rq *rq = cpu_rq(i); unsigned long flags; -- cgit v1.2.3 From 2398f2c6d34b43025f274fc42eaca34d23ec2320 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:35 +0200 Subject: sched: update shares on wakeup We found that the affine wakeup code needs rather accurate load figures to be effective. The trouble is that updating the load figures is fairly expensive with group scheduling. Therefore ratelimit the updating. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 30 +++++++++++++++++++++++++++++- kernel/sched_features.h | 3 ++- kernel/sysctl.c | 8 ++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1cff969f664..62db0891025 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -777,6 +777,12 @@ late_initcall(sched_init_debug); */ const_debug unsigned int sysctl_sched_nr_migrate = 32; +/* + * ratelimit for updating the group shares. + * default: 0.5ms + */ +const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; + /* * period over which we measure -rt task cpu usage in us. * default: 1s @@ -1590,7 +1596,13 @@ tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) static void update_shares(struct sched_domain *sd) { - walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + u64 now = cpu_clock(raw_smp_processor_id()); + s64 elapsed = now - sd->last_update; + + if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { + sd->last_update = now; + walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + } } static void update_shares_locked(struct rq *rq, struct sched_domain *sd) @@ -2199,6 +2211,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!sched_feat(SYNC_WAKEUPS)) sync = 0; +#ifdef CONFIG_SMP + if (sched_feat(LB_WAKEUP_UPDATE)) { + struct sched_domain *sd; + + this_cpu = raw_smp_processor_id(); + cpu = task_cpu(p); + + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + update_shares(sd); + break; + } + } + } +#endif + smp_wmb(); rq = task_rq_lock(p, &flags); old_state = p->state; diff --git a/kernel/sched_features.h b/kernel/sched_features.h index d56e3053e74..7d616d2a2a3 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -8,4 +8,5 @@ SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(HRTICK, 1) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(ASYM_GRAN, 1) -SCHED_FEAT(LB_BIAS, 0) \ No newline at end of file +SCHED_FEAT(LB_BIAS, 0) +SCHED_FEAT(LB_WAKEUP_UPDATE, 1) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 29116652dca..fe8cdc80ff0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -264,6 +264,14 @@ static struct ctl_table kern_table[] = { .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_shares_ratelimit", + .data = &sysctl_sched_shares_ratelimit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + 
.proc_handler = &proc_dointvec, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "sched_child_runs_first", -- cgit v1.2.3 From 243e0e7b7d3b54749ece2e879ecd7e2a11874443 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Fri, 27 Jun 2008 13:41:36 +0200 Subject: sched: fix mult overflow It was observed these mults can overflow. Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0d197be3e3e..26ebe180cde 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1477,7 +1477,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; unsigned long busiest_h_load = busiest_cfs_rq->h_load; unsigned long busiest_weight = busiest_cfs_rq->load.weight; - long rem_load, moved_load; + u64 rem_load, moved_load; /* * empty group @@ -1485,8 +1485,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, if (!busiest_cfs_rq->task_weight) continue; - rem_load = rem_load_move * busiest_weight; - rem_load /= busiest_h_load + 1; + rem_load = (u64)rem_load_move * busiest_weight; + rem_load = div_u64(rem_load, busiest_h_load + 1); moved_load = __load_balance_fair(this_rq, this_cpu, busiest, rem_load, sd, idle, all_pinned, this_best_prio, @@ -1496,7 +1496,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, continue; moved_load *= busiest_h_load; - moved_load /= busiest_weight + 1; + moved_load = div_u64(moved_load, busiest_weight + 1); rem_load_move -= moved_load; if (rem_load_move < 0) -- cgit v1.2.3 From 83378269a5fad98f562ebc0f09c349575e6cbfe1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:37 +0200 Subject: sched: correct wakeup weight calculations rw_i = {2, 4, 1, 0} s_i = {2/7, 4/7, 1/7, 0} wakeup on cpu0, weight=1 rw'_i = {3, 4, 1, 0} s'_i = {3/8, 4/8, 1/8, 0} s_0 = S * rw_0 / \Sum rw_j -> \Sum rw_j = S*rw_0/s_0 = 1*2*7/2 = 7 (correct) s'_0 = S * (rw_0 + 1) / (\Sum rw_j + 1) = 1 * (2+1) / (7+1) = 3/8 (correct so we find that adding 1 to cpu0 gains 5/56 in weight if say the other cpu were, cpu1, we'd also have to calculate its 4/56 loss Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ kernel/sched_fair.c | 48 ++++++++++++++++++++++++++---------------------- 2 files changed, 30 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 62db0891025..01d3e51b711 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -365,6 +365,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #else static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} #endif /* CONFIG_GROUP_SCHED */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 26ebe180cde..bed2f71e63d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1074,10 +1074,10 @@ static inline int wake_idle(int cpu, struct task_struct *p) static const struct sched_class fair_sched_class; #ifdef CONFIG_FAIR_GROUP_SCHED -static unsigned long effective_load(struct task_group *tg, long wl, int cpu) +static unsigned long effective_load(struct task_group *tg, int cpu, + unsigned long wl, unsigned long wg) { struct 
sched_entity *se = tg->se[cpu]; - long wg = wl; for_each_sched_entity(se) { #define D(n) (likely(n) ? (n) : 1) @@ -1092,6 +1092,13 @@ static unsigned long effective_load(struct task_group *tg, long wl, int cpu) b = S*rw + s*wg; wl = s*(a-b)/D(b); + /* + * Assume the group is already running and will + * thus already be accounted for in the weight. + * + * That is, moving shares between CPUs, does not + * alter the group weight. + */ wg = 0; #undef D } @@ -1099,26 +1106,12 @@ static unsigned long effective_load(struct task_group *tg, long wl, int cpu) return wl; } -static unsigned long task_load_sub(struct task_struct *p) -{ - return effective_load(task_group(p), -(long)p->se.load.weight, task_cpu(p)); -} - -static unsigned long task_load_add(struct task_struct *p, int cpu) -{ - return effective_load(task_group(p), p->se.load.weight, cpu); -} - #else -static unsigned long task_load_sub(struct task_struct *p) +static inline unsigned long effective_load(struct task_group *tg, int cpu, + unsigned long wl, unsigned long wg) { - return -p->se.load.weight; -} - -static unsigned long task_load_add(struct task_struct *p, int cpu) -{ - return p->se.load.weight; + return wl; } #endif @@ -1130,8 +1123,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, unsigned int imbalance) { struct task_struct *curr = this_rq->curr; + struct task_group *tg; unsigned long tl = this_load; unsigned long tl_per_task; + unsigned long weight; int balanced; if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) @@ -1142,10 +1137,19 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, * effect of the currently running task from the load * of the current CPU: */ - if (sync) - tl += task_load_sub(current); + if (sync) { + tg = task_group(current); + weight = current->se.load.weight; + + tl += effective_load(tg, this_cpu, -weight, -weight); + load += effective_load(tg, prev_cpu, 0, -weight); + } + + tg = task_group(p); + weight = p->se.load.weight; - balanced = 100*(tl + task_load_add(p, this_cpu)) <= imbalance*load; + balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= + imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); /* * If the currently running task will sleep within -- cgit v1.2.3 From f1d239f73200a5803a89e5929fb3abc1596b7589 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:38 +0200 Subject: sched: incremental effective_load() Increase the accuracy of the effective_load values. Not only consider the current increment (as per the attempted wakeup), but also consider the delta between when we last adjusted the shares and the current situation. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++++ kernel/sched_fair.c | 18 +++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 01d3e51b711..7613f69f097 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -427,6 +427,11 @@ struct cfs_rq { * this cpu's part of tg->shares */ unsigned long shares; + + /* + * load.weight at the time we set shares + */ + unsigned long rq_weight; #endif #endif }; @@ -1527,6 +1532,7 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, * record the actual number of shares, not the boosted amount. */ tg->cfs_rq[cpu]->shares = boost ? 
0 : shares; + tg->cfs_rq[cpu]->rq_weight = rq_weight; if (shares < MIN_SHARES) shares = MIN_SHARES; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index bed2f71e63d..e87f1a52f62 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1074,10 +1074,22 @@ static inline int wake_idle(int cpu, struct task_struct *p) static const struct sched_class fair_sched_class; #ifdef CONFIG_FAIR_GROUP_SCHED -static unsigned long effective_load(struct task_group *tg, int cpu, - unsigned long wl, unsigned long wg) +static long effective_load(struct task_group *tg, int cpu, + long wl, long wg) { struct sched_entity *se = tg->se[cpu]; + long more_w; + + if (!tg->parent) + return wl; + + /* + * Instead of using this increment, also add the difference + * between when the shares were last updated and now. + */ + more_w = se->my_q->load.weight - se->my_q->rq_weight; + wl += more_w; + wg += more_w; for_each_sched_entity(se) { #define D(n) (likely(n) ? (n) : 1) @@ -1086,7 +1098,7 @@ static unsigned long effective_load(struct task_group *tg, int cpu, S = se->my_q->tg->shares; s = se->my_q->shares; - rw = se->my_q->load.weight; + rw = se->my_q->rq_weight; a = S*(rw + wl); b = S*rw + s*wg; -- cgit v1.2.3 From f5bfb7d9ff73d72ee4f2f4830a6f0c9088d00f92 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:39 +0200 Subject: sched: bias effective_load() error towards failing wake_affine(). Measurement shows that the difference between cgroup:/ and cgroup:/foo wake_affine() results is that the latter succeeds significantly more. Therefore bias the calculations towards failing the test. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 28 ++++++++++++++++++++++++++++ kernel/sched_features.h | 1 + 2 files changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e87f1a52f62..9bcc0030a58 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1074,6 +1074,27 @@ static inline int wake_idle(int cpu, struct task_struct *p) static const struct sched_class fair_sched_class; #ifdef CONFIG_FAIR_GROUP_SCHED +/* + * effective_load() calculates the load change as seen from the root_task_group + * + * Adding load to a group doesn't make a group heavier, but can cause movement + * of group shares between cpus. Assuming the shares were perfectly aligned one + * can calculate the shift in shares. + * + * The problem is that perfectly aligning the shares is rather expensive, hence + * we try to avoid doing that too often - see update_shares(), which ratelimits + * this change. + * + * We compensate this by not only taking the current delta into account, but + * also considering the delta between when the shares were last adjusted and + * now. + * + * We still saw a performance dip, some tracing learned us that between + * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased + * significantly. Therefore try to bias the error in direction of failing + * the affine wakeup. + * + */ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { @@ -1083,6 +1104,13 @@ static long effective_load(struct task_group *tg, int cpu, if (!tg->parent) return wl; + /* + * By not taking the decrease of shares on the other cpu into + * account our error leans towards reducing the affine wakeups. 
+ */ + if (!wl && sched_feat(ASYM_EFF_LOAD)) + return wl; + /* * Instead of using this increment, also add the difference * between when the shares were last updated and now. diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 7d616d2a2a3..862b06bd560 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -10,3 +10,4 @@ SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 0) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) +SCHED_FEAT(ASYM_EFF_LOAD, 1) -- cgit v1.2.3 From 55e12e5e7b1d7e7c05a4be10cb5fd092c039aa78 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Tue, 24 Jun 2008 23:39:43 +0530 Subject: sched: make sched_{rt,fair}.c ifdefs more readable Signed-off-by: Dhaval Giani Cc: Srivatsa Vaddagiri Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 +++--- kernel/sched_rt.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9bcc0030a58..2e43d4a748c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -921,7 +921,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) hrtick_start(rq, delta, requeue); } } -#else +#else /* !CONFIG_SCHED_HRTICK */ static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) { @@ -1062,7 +1062,7 @@ static int wake_idle(int cpu, struct task_struct *p) } return cpu; } -#else +#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/ static inline int wake_idle(int cpu, struct task_struct *p) { return cpu; @@ -1586,7 +1586,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } -#endif +#endif /* CONFIG_SMP */ /* * scheduler tick hitting a task of our scheduling class: diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 765932d0399..47ceac9e855 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -161,7 +161,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) return &rt_rq->tg->rt_bandwidth; } -#else +#else /* !CONFIG_RT_GROUP_SCHED */ static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { @@ -226,7 +226,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) return &def_rt_bandwidth; } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP static int do_balance_runtime(struct rt_rq *rt_rq) @@ -374,12 +374,12 @@ static int balance_runtime(struct rt_rq *rt_rq) return more; } -#else +#else /* !CONFIG_SMP */ static inline int balance_runtime(struct rt_rq *rt_rq) { return 0; } -#endif +#endif /* CONFIG_SMP */ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { @@ -1472,4 +1472,4 @@ static void print_rt_stats(struct seq_file *m, int cpu) print_rt_rq(m, cpu, rt_rq); rcu_read_unlock(); } -#endif +#endif /* CONFIG_SCHED_DEBUG */ -- cgit v1.2.3 From 4c9fe8ad813b257a2b9ddf0f752105a75a7dae63 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 27 Jun 2008 14:49:35 +0200 Subject: sched: export cpu_clock the rcutorture module relies on cpu_clock. 
Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 3c696db5945..ed5a8c41504 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -256,3 +256,4 @@ unsigned long long cpu_clock(int cpu) return clock; } +EXPORT_SYMBOL_GPL(cpu_clock); -- cgit v1.2.3 From ad118c54a3587b2c69a769d0ba37d4d8dce4559d Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 27 Jun 2008 18:04:48 +0200 Subject: stacktrace: add saved stack traces to backtrace self-test This patch adds saved stack-traces to the backtrace suite of self-tests. Note that we don't depend on or unconditionally enable CONFIG_STACKTRACE because not all architectures may have it (and we still want to enable the other tests for those architectures). Cc: Arjan van de Ven Signed-off-by: Vegard Nossum Cc: Arjan van de Ven Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/backtracetest.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index d1a7605c5b8..50f7abd0813 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -10,9 +10,10 @@ * of the License. */ +#include #include #include -#include +#include static struct timer_list backtrace_timer; @@ -22,6 +23,31 @@ static void backtrace_test_timer(unsigned long data) printk("The following trace is a kernel self test and not a bug!\n"); dump_stack(); } + +#ifdef CONFIG_STACKTRACE +static void backtrace_test_saved(void) +{ + struct stack_trace trace; + unsigned long entries[8]; + + printk("Testing a saved backtrace.\n"); + printk("The following trace is a kernel self test and not a bug!\n"); + + trace.nr_entries = 0; + trace.max_entries = ARRAY_SIZE(entries); + trace.entries = entries; + trace.skip = 0; + + save_stack_trace(&trace); + print_stack_trace(&trace, 0); +} +#else +static void backtrace_test_saved(void) +{ + printk("Saved backtrace test skipped.\n"); +} +#endif + static int backtrace_regression_test(void) { printk("====[ backtrace testing ]===========\n"); @@ -29,6 +55,8 @@ static int backtrace_regression_test(void) printk("The following trace is a kernel self test and not a bug!\n"); dump_stack(); + backtrace_test_saved(); + init_timer(&backtrace_timer); backtrace_timer.function = backtrace_test_timer; mod_timer(&backtrace_timer, jiffies + 10); -- cgit v1.2.3 From 4e6a0535dd036377961027262aecb138099f925d Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 27 Jun 2008 18:06:54 +0200 Subject: backtrace: replace timer with tasklet + completions On qemu, the backtrace would show up _after_ the "end of backtrace testing" message. This patch changes it to use completions instead, which will guarantee that no such race exists. Cc: Arjan van de Ven Signed-off-by: Vegard Nossum Cc: Arjan van de Ven Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/backtracetest.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index 50f7abd0813..a5e026bc45c 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -10,18 +10,39 @@ * of the License. 
*/ +#include #include +#include #include #include #include -static struct timer_list backtrace_timer; +static void backtrace_test_normal(void) +{ + printk("Testing a backtrace from process context.\n"); + printk("The following trace is a kernel self test and not a bug!\n"); + + dump_stack(); +} + +static DECLARE_COMPLETION(backtrace_work); + +static void backtrace_test_irq_callback(unsigned long data) +{ + dump_stack(); + complete(&backtrace_work); +} + +static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0); -static void backtrace_test_timer(unsigned long data) +static void backtrace_test_irq(void) { printk("Testing a backtrace from irq context.\n"); printk("The following trace is a kernel self test and not a bug!\n"); - dump_stack(); + + init_completion(&backtrace_work); + tasklet_schedule(&backtrace_tasklet); + wait_for_completion(&backtrace_work); } #ifdef CONFIG_STACKTRACE @@ -51,17 +72,11 @@ static void backtrace_test_saved(void) static int backtrace_regression_test(void) { printk("====[ backtrace testing ]===========\n"); - printk("Testing a backtrace from process context.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); - dump_stack(); + backtrace_test_normal(); + backtrace_test_irq(); backtrace_test_saved(); - init_timer(&backtrace_timer); - backtrace_timer.function = backtrace_test_timer; - mod_timer(&backtrace_timer, jiffies + 10); - - msleep(10); printk("====[ end of backtrace testing ]====\n"); return 0; } -- cgit v1.2.3 From 79c537998d143b127c8c662a403c3356cb885f1c Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Sun, 29 Jun 2008 00:16:56 +0200 Subject: sched: fix cpu hotplug the CPU hotplug problems (crashes under high-volume unplug+replug tests) seem to be related to migrate_dead_tasks(). Firstly I added traces to see all tasks being migrated with migrate_live_tasks() and migrate_dead_tasks(). On my setup the problem pops up (the one with "se == NULL" in the loop of pick_next_task_fair()) shortly after the traces indicate that some has been migrated with migrate_dead_tasks()). btw., I can reproduce it much faster now with just a plain cpu down/up loop. [disclaimer] Well, unless I'm really missing something important in this late hour [/desclaimer] pick_next_task() is not something appropriate for migrate_dead_tasks() :-) the following change seems to eliminate the problem on my setup (although, I kept it running only for a few minutes to get a few messages indicating migrate_dead_tasks() does move tasks and the system is still ok) Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3aaa5c8cb42..a66e85639de 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5887,6 +5887,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu) next = pick_next_task(rq, rq->curr); if (!next) break; + next->sched_class->put_prev_task(rq, next); migrate_dead(dead_cpu, next); } -- cgit v1.2.3 From 2d452c9b10caeec455eb5e56a0ef4ed485178213 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 29 Jun 2008 15:01:59 +0200 Subject: sched: sched_clock_cpu() based cpu_clock(), lockdep fix Vegard Nossum reported: > WARNING: at kernel/lockdep.c:2738 check_flags+0x142/0x160() which happens due to: unsigned long long cpu_clock(int cpu) { unsigned long long clock; unsigned long flags; raw_local_irq_save(flags); as lower level functions can take locks, we must not do that, use proper lockdep-annotated irq save/restore. 
Reported-by: Vegard Nossum Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index ed5a8c41504..60094e257a9 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -250,9 +250,9 @@ unsigned long long cpu_clock(int cpu) unsigned long long clock; unsigned long flags; - raw_local_irq_save(flags); + local_irq_save(flags); clock = sched_clock_cpu(cpu); - raw_local_irq_restore(flags); + local_irq_restore(flags); return clock; } -- cgit v1.2.3 From 34e83e850f5e5ee2a18cd77a5d70d31972a632e6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 27 Jun 2008 15:42:36 +0200 Subject: sched: build fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: kernel/sched.c: In function ‘sched_group_set_shares': kernel/sched.c:8635: error: implicit declaration of function ‘cfs_rq_set_shares' Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7613f69f097..058250a63b6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1627,11 +1627,6 @@ static void update_h_load(int cpu) walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); } -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) -{ - cfs_rq->shares = shares; -} - #else static inline void update_shares(struct sched_domain *sd) @@ -1646,6 +1641,13 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #endif +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) + cfs_rq->shares = shares; +#endif +} + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" -- cgit v1.2.3 From 30432094a7f506ad24997a3ba6aed913ab61c01d Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 27 Jun 2008 21:35:50 +0200 Subject: sched: fix warning This patch fixes the following warning: kernel/sched.c:1667: warning: 'cfs_rq_set_shares' defined but not used This seems the correct way to fix this; cfs_rq_set_shares() is only used in a single place, which is also inside #ifdef CONFIG_FAIR_GROUP_SCHED. Signed-off-by: Vegard Nossum Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 058250a63b6..677c80b9a6b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1641,12 +1641,14 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #endif +#ifdef CONFIG_FAIR_GROUP_SCHED static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) { -#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +#ifdef CONFIG_SMP cfs_rq->shares = shares; #endif } +#endif #include "sched_stats.h" #include "sched_idletask.c" -- cgit v1.2.3 From 8594698ebddeef5443b7da8258ae33b3eaca61d5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 27 Jun 2008 21:20:17 +0200 Subject: stacktrace: fix modular build, export print_stack_trace and save_stack_trace fix: ERROR: "print_stack_trace" [kernel/backtracetest.ko] undefined! ERROR: "save_stack_trace" [kernel/backtracetest.ko] undefined! Signed-off-by: Ingo Molnar and fix: Building modules, stage 2. MODPOST 376 modules ERROR: "print_stack_trace" [kernel/backtracetest.ko] undefined! 
make[1]: *** [__modpost] Error 1 Signed-off-by: Ingo Molnar --- kernel/stacktrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 7eaea9d02a5..94b527ef1d1 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -6,6 +6,7 @@ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar */ #include +#include #include #include @@ -21,4 +22,5 @@ void print_stack_trace(struct stack_trace *trace, int spaces) print_ip_sym(trace->entries[i]); } } +EXPORT_SYMBOL_GPL(print_stack_trace); -- cgit v1.2.3 From 619b0488038224391e64fa03854651ca0f5efe56 Mon Sep 17 00:00:00 2001 From: Raistlin Date: Thu, 26 Jun 2008 18:54:09 +0200 Subject: sched: fix divide error when trying to configure rt_period to zero Here it is another little Oops we found while configuring invalid values via cgroups: echo 0 > /dev/cgroups/0/cpu.rt_period_us or echo 4294967296 > /dev/cgroups/0/cpu.rt_period_us [ 205.509825] divide error: 0000 [#1] [ 205.510151] Modules linked in: [ 205.510151] [ 205.510151] Pid: 2339, comm: bash Not tainted (2.6.26-rc8 #33) [ 205.510151] EIP: 0060:[] EFLAGS: 00000293 CPU: 0 [ 205.510151] EIP is at div64_u64+0x5f/0x70 [ 205.510151] EAX: 0000389f EBX: 00000000 ECX: 00000000 EDX: 00000000 [ 205.510151] ESI: d9800000 EDI: 00000000 EBP: c6cede60 ESP: c6cede50 [ 205.510151] DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068 [ 205.510151] Process bash (pid: 2339, ti=c6cec000 task=c79be370 task.ti=c6cec000) [ 205.510151] Stack: d9800000 0000389f c05971a0 d9800000 c6cedeb4 c0214dbd 00000000 00000000 [ 205.510151] c6cede88 c0242bd8 c05377c0 c7a41b40 00000000 00000000 00000000 c05971a0 [ 205.510151] c780ed20 c7508494 c7a41b40 00000000 00000002 c6cedebc c05971a0 ffffffea [ 205.510151] Call Trace: [ 205.510151] [] ? __rt_schedulable+0x1cd/0x240 [ 205.510151] [] ? cgroup_file_open+0x18/0xe0 [ 205.510151] [] ? tg_set_bandwidth+0xa4/0xf0 [ 205.510151] [] ? sched_group_set_rt_period+0x36/0x50 [ 205.510151] [] ? cpu_rt_period_write_uint+0xe/0x10 [ 205.510151] [] ? cgroup_file_write+0x125/0x160 [ 205.510151] [] ? hrtimer_interrupt+0x155/0x190 [ 205.510151] [] ? security_file_permission+0xf/0x20 [ 205.510151] [] ? rw_verify_area+0x48/0xc0 [ 205.510151] [] ? dupfd+0x104/0x130 [ 205.510151] [] ? vfs_write+0x9c/0x160 [ 205.510151] [] ? cgroup_file_write+0x0/0x160 [ 205.510151] [] ? sys_write+0x3d/0x70 [ 205.510151] [] ? sysenter_past_esp+0x6a/0x91 [ 205.510151] ======================= [ 205.510151] Code: 0f 45 de 31 f6 0f ad d0 d3 ea f6 c1 20 0f 45 c2 0f 45 d6 89 45 f0 89 55 f4 8b 55 f4 31 c9 8b 45 f0 39 d3 89 c6 77 08 89 d0 31 d2 f3 89 c1 83 c4 08 89 f0 f7 f3 89 ca 5b 5e 5d c3 55 89 e5 56 [ 205.510151] EIP: [] div64_u64+0x5f/0x70 SS:ESP 0068:c6cede50 The attached patch solves the issue for me. I'm checking as soon as possible for the period not being zero since, if it is, going ahead is useless. This way we also save a mutex_lock() and a read_lock() wrt doing it inside tg_set_bandwidth() or __rt_schedulable(). 
Signed-off-by: Dario Faggioli Signed-off-by: Michael Trimarchi Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a66e85639de..94ead43eda6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8502,6 +8502,9 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) rt_period = (u64)rt_period_us * NSEC_PER_USEC; rt_runtime = tg->rt_bandwidth.rt_runtime; + if (rt_period == 0) + return -EINVAL; + return tg_set_bandwidth(tg, rt_period, rt_runtime); } -- cgit v1.2.3 From 8558f8f81680a43d383abd1b5f23d3501fedfa65 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Fri, 27 Jun 2008 10:17:38 +0530 Subject: rcu: fix hotplug vs rcu race Dhaval Giani reported this warning during cpu hotplug stress-tests: | On running kernel compiles in parallel with cpu hotplug: | | WARNING: at arch/x86/kernel/smp.c:118 | native_smp_send_reschedule+0x21/0x36() | Modules linked in: | Pid: 27483, comm: cc1 Not tainted 2.6.26-rc7 #1 | [...] | [] native_smp_send_reschedule+0x21/0x36 | [] force_quiescent_state+0x47/0x57 | [] call_rcu+0x51/0x6d | [] __fput+0x130/0x158 | [] fput+0x17/0x19 | [] filp_close+0x4d/0x57 | [] sys_close+0x5c/0x97 IMHO the warning is a spurious one. cpu_online_map is updated by the _cpu_down() using stop_machine_run(). Since force_quiescent_state is invoked from irqs disabled section, stop_machine_run() won't be executing while a cpu is executing force_quiescent_state(). Hence the cpu_online_map is stable while we're in the irq disabled section. However, a cpu might have been offlined _just_ before we disabled irqs while entering force_quiescent_state(). And rcu subsystem might not yet have handled the CPU_DEAD notification, leading to the offlined cpu's bit being set in the rcp->cpumask. Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent sending smp_reschedule() to an offlined CPU. Here's the timeline: CPU_A CPU_B -------------------------------------------------------------- cpu_down(): . . . . . stop_machine(): /* disables preemption, . * and irqs */ . . . . . take_cpu_down(); . . . . . . . cpu_disable(); /*this removes cpu . *from cpu_online_map . */ . . . . . restart_machine(); /* enables irqs */ . ------WINDOW DURING WHICH rcp->cpumask is stale --------------- . call_rcu(); . /* disables irqs here */ . .force_quiescent_state(); .CPU_DEAD: .for_each_cpu(rcp->cpumask) . . smp_send_reschedule(); . . . . WARN_ON() for offlined CPU! . . . rcu_cpu_notify: . -------- WINDOW ENDS ------------------------------------------ rcu_offline_cpu() /* Which calls cpu_quiet() * which removes * cpu from rcp->cpumask. */ If a new batch was started just before calling stop_machine_run(), the "tobe-offlined" cpu is still present in rcp-cpumask. During a cpu-offline, from take_cpu_down(), we queue an rt-prio idle task as the next task to be picked by the scheduler. We also call cpu_disable() which will disable any further interrupts and remove the cpu's bit from the cpu_online_map. Once the stop_machine_run() successfully calls take_cpu_down(), it calls schedule(). That's the last time a schedule is called on the offlined cpu, and hence the last time when rdp->passed_quiesc will be set to 1 through rcu_qsctr_inc(). But the cpu_quiet() will be on this cpu will be called only when the next RCU_SOFTIRQ occurs on this CPU. So at this time, the offlined CPU is still set in rcp->cpumask. 
Now coming back to the idle_task which truely offlines the CPU, it does check for a pending RCU and raises the softirq, since it will find rdp->passed_quiesc to be 0 in this case. However, since the cpu is offline I am not sure if the softirq will trigger on the CPU. Even if it doesn't the rcu_offline_cpu() will find that rcp->completed is not the same as rcp->cur, which means that our cpu could be holding up the grace period progression. Hence we call cpu_quiet() and move ahead. But because of the window explained in the timeline, we could still have a call_rcu() before the RCU subsystem executes it's CPU_DEAD notification, and we send smp_send_reschedule() to offlined cpu while trying to force the quiescent states. The appended patch adds comments and prevents checking for offlined cpu everytime. cpu_online_map is updated by the _cpu_down() using stop_machine_run(). Since force_quiescent_state is invoked from irqs disabled section, stop_machine_run() won't be executing while a cpu is executing force_quiescent_state(). Hence the cpu_online_map is stable while we're in the irq disabled section. Reported-by: Dhaval Giani Signed-off-by: Gautham R Shenoy Acked-by: Dhaval Giani Cc: Dipankar Sarma Cc: laijs@cn.fujitsu.com Cc: Peter Zijlstra Cc: Rusty Russel Cc: "Paul E. McKenney" Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index f4ffbd0f306..a38895a5b8e 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -89,8 +89,22 @@ static void force_quiescent_state(struct rcu_data *rdp, /* * Don't send IPI to itself. With irqs disabled, * rdp->cpu is the current cpu. + * + * cpu_online_map is updated by the _cpu_down() + * using stop_machine_run(). Since we're in irqs disabled + * section, stop_machine_run() is not exectuting, hence + * the cpu_online_map is stable. + * + * However, a cpu might have been offlined _just_ before + * we disabled irqs while entering here. + * And rcu subsystem might not yet have handled the CPU_DEAD + * notification, leading to the offlined cpu's bit + * being set in the rcp->cpumask. + * + * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent + * sending smp_reschedule() to an offlined CPU. */ - cpumask = rcp->cpumask; + cpus_and(cpumask, rcp->cpumask, cpu_online_map); cpu_clear(rdp->cpu, cpumask); for_each_cpu_mask(cpu, cpumask) smp_send_reschedule(cpu); -- cgit v1.2.3 From 3e61e0c976532a542b23bbb74c8f631815171078 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 30 Jun 2008 23:48:37 +0300 Subject: mmiotrace broken in linux-next (8-bit writes only) The moment mmiotrace is enabled, I hit a NULL deref in: IP: [] __trace_special+0x17c/0x23a Call Trace: [] ftrace_special+0x6f/0x9a [] down+0x19/0x4a [] acquire_console_sem+0x42/0x58 [] con_flush_chars+0x28/0x43 [] write_chan+0x22e/0x334 [] ? default_wake_function+0x0/0xf [] tty_write+0x195/0x228 [] ? write_chan+0x0/0x334 [] vfs_write+0xae/0x137 [] sys_write+0x47/0x70 [] system_call_after_swapgs+0x7b/0x80 which means 'entry' in __trace_special() is NULL. [ mingo@elte.hu: that ftrace_special() was a leftover. 
] Signed-off-by: Pekka Paalanen Cc: Steven Rostedt Cc: proski@gnu.org Cc: "Vegard Nossum" Signed-off-by: Ingo Molnar --- kernel/semaphore.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 1a064adab65..aaaeae8244e 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -54,7 +54,6 @@ void down(struct semaphore *sem) { unsigned long flags; - ftrace_special(sem->count, 0, __LINE__); spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; -- cgit v1.2.3 From db26e64dc3f0d51c4db1a625c248a81f7850eee9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 20 May 2008 19:16:33 +0200 Subject: pm_qos_params: BKL pushdown [jmc: added ] Signed-off-by: Arnd Bergmann Signed-off-by: Jonathan Corbet --- kernel/pm_qos_params.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 0afe32be4c8..8cb75702638 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -358,15 +359,19 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) int ret; long pm_qos_class; + lock_kernel(); pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); if (pm_qos_class >= 0) { filp->private_data = (void *)pm_qos_class; sprintf(name, "process_%d", current->pid); ret = pm_qos_add_requirement(pm_qos_class, name, PM_QOS_DEFAULT_VALUE); - if (ret >= 0) + if (ret >= 0) { + unlock_kernel(); return 0; + } } + unlock_kernel(); return -EPERM; } -- cgit v1.2.3 From da9cbc87395308a21465bd25441297bbba0477e1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Jun 2008 20:42:08 +0200 Subject: block: blkdev.h cleanup, move iocontext stuff to iocontext.h Signed-off-by: Jens Axboe --- kernel/exit.c | 1 + kernel/fork.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 8f6185e69b6..ceb25878283 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 19908b26cf8..b71ccd09fc8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 98a05ed4bd7774f533ab185fe0bf2fdc58292d7c Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Thu, 26 Jun 2008 22:51:51 +0530 Subject: ftrace: prevent ftrace modifications while being kprobe'd, v2 add two missing chunks for ftrace+kprobe. 
Signed-off-by: Abhishek Sagar Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 85e84133541..0f271c45cd0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -502,8 +502,12 @@ static void ftrace_replace_code(int enable) continue; /* ignore updates to this record's mcount site */ - if (get_kprobe((void *)rec->ip)) + if (get_kprobe((void *)rec->ip)) { + freeze_record(rec); continue; + } else { + unfreeze_record(rec); + } failed = __ftrace_replace_code(rec, old, new, enable); if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { @@ -740,7 +744,10 @@ static int __ftrace_update_code(void *ignore) ftrace_del_hash(p); INIT_HLIST_NODE(&p->node); hlist_add_head(&p->node, &temp_list); + freeze_record(p); continue; + } else { + unfreeze_record(p); } /* convert record (i.e, patch mcount-call with NOP) */ -- cgit v1.2.3 From ee3ece830f6db9837f7ac67008f532a8c1e755f4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Jul 2008 14:31:26 -0400 Subject: hrtimer: prevent migration for raising softirq Due to a possible deadlock, the waking of the softirq was pushed outside of the hrtimer base locks. See commit 0c96c5979a522c3323c30a078a70120e29b5bdbc Unfortunately this allows the task to migrate after setting up the softirq and raising it. Since softirqs run a queue that is per-cpu we may raise the softirq on the wrong CPU and this will keep the queued softirq task from running. To solve this issue, this patch disables preemption around the releasing of the hrtimer lock and raising of the softirq. Signed-off-by: Steven Rostedt Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 421be5fe5cc..ab80515008f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1003,10 +1003,18 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) */ raise = timer->state == HRTIMER_STATE_PENDING; + /* + * We use preempt_disable to prevent this task from migrating after + * setting up the softirq and raising it. Otherwise, if me migrate + * we will raise the softirq on the wrong CPU. + */ + preempt_disable(); + unlock_hrtimer_base(timer, &flags); if (raise) hrtimer_raise_softirq(); + preempt_enable(); return ret; } -- cgit v1.2.3 From c4acb2c0669c5c5c9b28e9d02a34b5c67edf7092 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 27 Jun 2008 14:29:55 -0600 Subject: sched: terminate newidle balancing once at least one task has moved over Inspired by Peter Zijlstra. 
Signed-off-by: Gregory Haskins Cc: npiggin@suse.de Cc: rostedt@goodmis.org Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 677c80b9a6b..d99aeabeb72 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3013,6 +3013,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, max_load_move - total_load_moved, sd, idle, all_pinned, &this_best_prio); class = class->next; + + if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) + break; + } while (class && max_load_move > total_load_moved); return total_load_moved > 0; -- cgit v1.2.3 From 2087a1ad822cd3a68b73338457047fcc54da726b Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 27 Jun 2008 14:30:00 -0600 Subject: sched: add avg-overlap support to RT tasks We have the notion of tracking process-coupling (a.k.a. buddy-wake) via the p->se.last_wake / p->se.avg_overlap facilities, but it is only used for cfs to cfs interactions. There is no reason why an rt to cfs interaction cannot share in establishing a relationhip in a similar manner. Because PREEMPT_RT runs many kernel threads as FIFO priority, we often times have heavy interaction between RT threads waking CFS applications. This patch offers a substantial boost (50-60%+) in perfomance under those circumstances. Signed-off-by: Gregory Haskins Cc: npiggin@suse.de Cc: rostedt@goodmis.org Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 ++++++++++++++ kernel/sched_fair.c | 21 ++------------------- 2 files changed, 16 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d99aeabeb72..bbc40c3a065 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1693,6 +1693,12 @@ static void set_load_weight(struct task_struct *p) p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; } +static void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff >> 3; +} + static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) { sched_info_queued(p); @@ -1702,6 +1708,12 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) { + if (sleep && p->se.last_wakeup) { + update_avg(&p->se.avg_overlap, + p->se.sum_exec_runtime - p->se.last_wakeup); + p->se.last_wakeup = 0; + } + p->sched_class->dequeue_task(rq, p, sleep); p->se.on_rq = 0; } @@ -2313,6 +2325,8 @@ out_running: p->sched_class->task_wake_up(rq, p); #endif out: + current->se.last_wakeup = current->se.sum_exec_runtime; + task_rq_unlock(rq, &flags); return success; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2e43d4a748c..f2aa987027d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -726,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) __enqueue_entity(cfs_rq, se); } -static void update_avg(u64 *avg, u64 sample) -{ - s64 diff = sample - *avg; - *avg += diff >> 3; -} - -static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - if (!se->last_wakeup) - return; - - update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup); - se->last_wakeup = 0; -} - static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { @@ -751,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_stats_dequeue(cfs_rq, se); if (sleep) { - 
update_avg_stats(cfs_rq, se); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -1196,9 +1180,9 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, * a reasonable amount of time then attract this newly * woken task: */ - if (sync && balanced && curr->sched_class == &fair_sched_class) { + if (sync && balanced) { if (curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost) + p->se.avg_overlap < sysctl_sched_migration_cost) return 1; } @@ -1359,7 +1343,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) return; } - se->last_wakeup = se->sum_exec_runtime; if (unlikely(se == pse)) return; -- cgit v1.2.3 From 46ac22bab42cc868b9c1d0e915ddbc8e8065a44d Mon Sep 17 00:00:00 2001 From: Ankita Garg Date: Tue, 1 Jul 2008 14:30:06 +0530 Subject: sched: fix accounting in task delay accounting & migration On Thu, Jun 19, 2008 at 12:27:14PM +0200, Peter Zijlstra wrote: > On Thu, 2008-06-05 at 10:50 +0530, Ankita Garg wrote: > > > Thanks Peter for the explanation... > > > > I agree with the above and that is the reason why I did not see weird > > values with cpu_time. But, run_delay still would suffer skews as the end > > points for delta could be taken on different cpus due to migration (more > > so on RT kernel due to the push-pull operations). With the below patch, > > I could not reproduce the issue I had seen earlier. After every dequeue, > > we take the delta and start wait measurements from zero when moved to a > > different rq. > > OK, so task delay delay accounting is broken because it doesn't take > migration into account. > > What you've done is make it symmetric wrt enqueue, and account it like > > cpu0 cpu1 > > enqueue > > dequeue > enqueue > > run > > Where you add both d1 and d2 to the run_delay,.. right? > Thanks for reviewing the patch. The above is exactly what I have done. > This seems like a good fix, however it looks like the patch will break > compilation in !CONFIG_SCHEDSTATS && !CONFIG_TASK_DELAY_ACCT, of it > failing to provide a stub for sched_info_dequeue() in that case. Fixed. Pl. find the new patch below. 
Signed-off-by: Ankita Garg Acked-by: Peter Zijlstra Cc: Gregory Haskins Cc: rostedt@goodmis.org Cc: suresh.b.siddha@intel.com Cc: aneesh.kumar@linux.vnet.ibm.com Cc: dhaval@linux.vnet.ibm.com Cc: vatsa@linux.vnet.ibm.com Cc: David Bahi Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + kernel/sched_stats.h | 42 +++++++++++++++++++++++++++++++++--------- 2 files changed, 34 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index bbc40c3a065..996bc15196a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1714,6 +1714,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) p->se.last_wakeup = 0; } + sched_info_dequeued(p); p->sched_class->dequeue_task(rq, p, sleep); p->se.on_rq = 0; } diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 80179ef7450..8385d43987e 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -118,6 +118,13 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) if (rq) rq->rq_sched_info.cpu_time += delta; } + +static inline void +rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +{ + if (rq) + rq->rq_sched_info.run_delay += delta; +} # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) # define schedstat_set(var, val) do { var = (val); } while (0) @@ -126,6 +133,9 @@ static inline void rq_sched_info_arrive(struct rq *rq, unsigned long long delta) {} static inline void +rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +{} +static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} # define schedstat_inc(rq, field) do { } while (0) @@ -134,6 +144,11 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) #endif #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +static inline void sched_info_reset_dequeued(struct task_struct *t) +{ + t->sched_info.last_queued = 0; +} + /* * Called when a process is dequeued from the active array and given * the cpu. We should note that with the exception of interactive @@ -143,15 +158,22 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) * active queue, thus delaying tasks in the expired queue from running; * see scheduler_tick()). * - * This function is only called from sched_info_arrive(), rather than - * dequeue_task(). Even though a task may be queued and dequeued multiple - * times as it is shuffled about, we're really interested in knowing how - * long it was from the *first* time it was queued to the time that it - * finally hit a cpu. + * Though we are interested in knowing how long it was from the *first* time a + * task was queued to the time that it finally hit a cpu, we call this routine + * from dequeue_task() to account for possible rq->clock skew across cpus. The + * delta taken on each cpu would annul the skew. 
*/ static inline void sched_info_dequeued(struct task_struct *t) { - t->sched_info.last_queued = 0; + unsigned long long now = task_rq(t)->clock, delta = 0; + + if (unlikely(sched_info_on())) + if (t->sched_info.last_queued) + delta = now - t->sched_info.last_queued; + sched_info_reset_dequeued(t); + t->sched_info.run_delay += delta; + + rq_sched_info_dequeued(task_rq(t), delta); } /* @@ -165,7 +187,7 @@ static void sched_info_arrive(struct task_struct *t) if (t->sched_info.last_queued) delta = now - t->sched_info.last_queued; - sched_info_dequeued(t); + sched_info_reset_dequeued(t); t->sched_info.run_delay += delta; t->sched_info.last_arrival = now; t->sched_info.pcount++; @@ -242,7 +264,9 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) __sched_info_switch(prev, next); } #else -#define sched_info_queued(t) do { } while (0) -#define sched_info_switch(t, next) do { } while (0) +#define sched_info_queued(t) do { } while (0) +#define sched_info_reset_dequeued(t) do { } while (0) +#define sched_info_dequeued(t) do { } while (0) +#define sched_info_switch(t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ -- cgit v1.2.3 From cde53535991fbb5c34a1566f25955297c1487b8d Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 4 Jul 2008 09:59:22 -0700 Subject: Christoph has moved Remove all clameter@sgi.com addresses from the kernel tree since they will become invalid on June 27th. Change my maintainer email address for the slab allocators to cl@linux-foundation.org (which will be the new email address for the future). Signed-off-by: Christoph Lameter Signed-off-by: Christoph Lameter Cc: Pekka Enberg Cc: Stephen Rothwell Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 29fc39f1029..ce7799540c9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -13,7 +13,7 @@ * Kai Petzke * Theodore Ts'o * - * Made to use alloc_percpu by Christoph Lameter . + * Made to use alloc_percpu by Christoph Lameter. */ #include -- cgit v1.2.3 From 086f7316f0d400806d76323beefae996bb3849b1 Mon Sep 17 00:00:00 2001 From: "Andrew G. Morgan" Date: Fri, 4 Jul 2008 09:59:58 -0700 Subject: security: filesystem capabilities: fix fragile setuid fixup code This commit includes a bugfix for the fragile setuid fixup code in the case that filesystem capabilities are supported (in access()). The effect of this fix is gated on filesystem capability support because changing securebits is only supported when filesystem capabilities support is configured.) [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrew G. Morgan Acked-by: Serge Hallyn Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/capability.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index cfbe4429948..901e0fdc3ff 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -121,6 +121,27 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) * uninteresting and/or not to be changed. */ +/* + * Atomically modify the effective capabilities returning the original + * value. No permission check is performed here - it is assumed that the + * caller is permitted to set the desired effective capabilities. 
+ */ +kernel_cap_t cap_set_effective(const kernel_cap_t pE_new) +{ + kernel_cap_t pE_old; + + spin_lock(&task_capability_lock); + + pE_old = current->cap_effective; + current->cap_effective = pE_new; + + spin_unlock(&task_capability_lock); + + return pE_old; +} + +EXPORT_SYMBOL(cap_set_effective); + /** * sys_capget - get the capabilities of a given process. * @header: pointer to struct that contains capability version and -- cgit v1.2.3 From 3b7253238801a7b97b3929d8db2fa7a0721fb17b Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 16 Jun 2008 15:51:08 -0700 Subject: softlockup: print a module list on being stuck Most places in the kernel that go BUG: print a module list (which is very useful for doing statistics and finding patterns), however the softlockup detector does not do this yet. This patch adds the one line change to fix this gap. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index c828c2339cc..a272d78185e 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -120,6 +120,7 @@ void softlockup_tick(void) printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", this_cpu, now - touch_timestamp, current->comm, task_pid_nr(current)); + print_modules(); if (regs) show_regs(regs); else -- cgit v1.2.3 From aa276e1cafb3ce9d01d1e837bcd67e92616013ac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Jun 2008 19:15:00 +0200 Subject: x86, clockevents: add C1E aware idle function C1E on AMD machines is like C3 but without control from the OS. Up to now we disabled the local apic timer for those machines as it stops when the CPU goes into C1E. This excludes those machines from high resolution timers / dynamic ticks, which hurts especially X2 based laptops. The current boot time C1E detection has another, more serious flaw as well: some BIOSes do not enable C1E until the ACPI processor module is loaded. This causes systems to stop working after that point. To work nicely with C1E enabled machines we use a separate idle function, which checks on idle entry whether C1E was enabled in the Interrupt Pending Message MSR. This allows us to do timer broadcasting for C1E and covers the late enablement of C1E as well. 
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 57a1f02e5ec..67f80c26170 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -30,6 +30,7 @@ struct tick_device tick_broadcast_device; static cpumask_t tick_broadcast_mask; static DEFINE_SPINLOCK(tick_broadcast_lock); +static int tick_broadcast_force; #ifdef CONFIG_TICK_ONESHOT static void tick_broadcast_clear_oneshot(int cpu); @@ -232,10 +233,11 @@ static void tick_do_broadcast_on_off(void *why) CLOCK_EVT_MODE_SHUTDOWN); } if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) - dev->features |= CLOCK_EVT_FEAT_DUMMY; + tick_broadcast_force = 1; break; case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - if (cpu_isset(cpu, tick_broadcast_mask)) { + if (!tick_broadcast_force && + cpu_isset(cpu, tick_broadcast_mask)) { cpu_clear(cpu, tick_broadcast_mask); if (td->mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); -- cgit v1.2.3 From 076ac2af86c3b7f89ac31bc50a7508d3e035b786 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:12 +0200 Subject: sched, numa: replace MAX_NUMNODES with nr_node_ids in kernel/sched.c * Replace usages of MAX_NUMNODES with nr_node_ids in kernel/sched.c, where appropriate. This saves some allocated space as well as many wasted cycles going through node entries that are non-existent. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 94ead43eda6..bcc22b569ee 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6538,9 +6538,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) min_val = INT_MAX; - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { /* Start at @node */ - n = (node + i) % MAX_NUMNODES; + n = (node + i) % nr_node_ids; if (!nr_cpus_node(n)) continue; @@ -6734,7 +6734,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) if (!sched_group_nodes) continue; - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; *nodemask = node_to_cpumask(i); @@ -6927,7 +6927,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* * Allocate the per-node list of sched groups */ - sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), + sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); @@ -7066,7 +7066,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif /* Set up physical groups */ - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { SCHED_CPUMASK_VAR(nodemask, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); @@ -7090,7 +7090,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, send_covered, tmpmask); } - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { /* Set up node groups */ struct sched_group *sg, *prev; SCHED_CPUMASK_VAR(nodemask, allmasks); @@ -7129,9 +7129,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map, cpus_or(*covered, *covered, *nodemask); prev = sg; - for (j = 0; j < MAX_NUMNODES; j++) { + for (j = 0; j < nr_node_ids; j++) { 
SCHED_CPUMASK_VAR(notcovered, allmasks); - int n = (i + j) % MAX_NUMNODES; + int n = (i + j) % nr_node_ids; node_to_cpumask_ptr(pnodemask, n); cpus_complement(*notcovered, *covered); @@ -7184,7 +7184,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, } #ifdef CONFIG_NUMA - for (i = 0; i < MAX_NUMNODES; i++) + for (i = 0; i < nr_node_ids; i++) init_numa_sched_groups_power(sched_group_nodes[i]); if (sd_allnodes) { -- cgit v1.2.3 From a29d1cfe9e9337aedeed505afddc8465ac709b87 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 2 Jun 2008 13:19:08 +0200 Subject: printk: export console_drivers this symbol is needed by drivers/video/xen-fbfront.ko. [ cherry-picked from tip/core/printk ] Signed-off-by: Ingo Molnar --- kernel/printk.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 028ed75d486..1fb1382009f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -75,6 +75,8 @@ EXPORT_SYMBOL(oops_in_progress); static DECLARE_MUTEX(console_sem); static DECLARE_MUTEX(secondary_console_sem); struct console *console_drivers; +EXPORT_SYMBOL_GPL(console_drivers); + /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's -- cgit v1.2.3 From 7683c57c489bd17795945f4ae1c1d73e7c7b38e3 Mon Sep 17 00:00:00 2001 From: Daniel Guilak Date: Tue, 8 Jul 2008 15:02:06 -0700 Subject: kernel/printk.c: Made printk_recursion_bug_msg static. Signed-off-by: Daniel Guilak Acked-by: Josh Triplett Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 8fb01c32aa3..e2129e83fd7 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -666,7 +666,7 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu) return retval; } -const char printk_recursion_bug_msg [] = +static const char printk_recursion_bug_msg [] = KERN_CRIT "BUG: recent printk recursion!\n"; static int printk_recursion_bug; -- cgit v1.2.3 From 48627d8d23c34106c1365563604739a50343edaf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Jul 2008 07:01:13 +0200 Subject: genirq: remove extraneous checks in manage.c In http://bugzilla.kernel.org/show_bug.cgi?id=9580 it was pointed out that the desc->chip checks are extraneous. In fact these are left overs from early development and can be removed safely. Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 469814e9b9e..77a51be3601 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -377,7 +377,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { - if (desc->chip && desc->chip->set_type) + if (desc->chip->set_type) desc->chip->set_type(irq, new->flags & IRQF_TRIGGER_MASK); else @@ -387,8 +387,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) */ printk(KERN_WARNING "No IRQF_TRIGGER set_type " "function for IRQ %d (%s)\n", irq, - desc->chip ? 
desc->chip->name : - "unknown"); + desc->chip->name); } else compat_irq_chip_set_default_handler(desc); -- cgit v1.2.3 From dc7fab8b3bb388c57c6c4a43ba68c8a32ca25204 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Thu, 10 Jul 2008 00:32:40 +0200 Subject: sched: fix cpu hotplug I think we may have a race between try_to_wake_up() and migrate_live_tasks() -> move_task_off_dead_cpu() when the later one may end up looping endlessly. Interrupts are enabled on other CPUs when migration_call(CPU_DEAD, ...) is called so we may get a race between try_to_wake_up() and migrate_live_tasks() -> move_task_off_dead_cpu(). The former one may push a task out of a dead CPU causing the later one to loop endlessly. Heiko Carstens observed: | That's exactly what explains a dump I got yesterday. Thanks for fixing! :) Signed-off-by: Dmitry Adamushko Cc: miaox@cn.fujitsu.com Cc: Lai Jiangshan Cc: Heiko Carstens Cc: Peter Zijlstra Cc: Avi Kivity Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 94ead43eda6..9397b871013 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5621,8 +5621,10 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) double_rq_lock(rq_src, rq_dest); /* Already moved. */ - if (task_cpu(p) != src_cpu) + if (task_cpu(p) != src_cpu) { + ret = 1; goto out; + } /* Affinity changed (again). */ if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; -- cgit v1.2.3 From 544304b200c3869bc1312bcf941c4cf04d65b56c Mon Sep 17 00:00:00 2001 From: Daniel Guilak Date: Thu, 10 Jul 2008 09:38:19 -0700 Subject: kernel/kprobes.c: Made kprobe_blacklist static. Signed-off-by: Daniel Guilak Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d4998f81e22..1485ca8d0e0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -79,7 +79,7 @@ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; * * For such cases, we now have a blacklist */ -struct kprobe_blackpoint kprobe_blacklist[] = { +static struct kprobe_blackpoint kprobe_blacklist[] = { {"preempt_schedule",}, {NULL} /* Terminator */ }; -- cgit v1.2.3 From 70ff05554f91a1edda1f11684da1dbde09e2feea Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 10 Jul 2008 17:25:35 +1000 Subject: Fix PREEMPT_RCU without HOTPLUG_CPU PREEMPT_RCU without HOTPLUG_CPU is broken. The rcu_online_cpu is called to initially populate rcu_cpu_online_map with all online CPUs when the hotplug event handler is installed, and also to populate the map with CPUs as they come online. The former case is meant to happen with and without HOTPLUG_CPU, but without HOTPLUG_CPU, the rcu_offline_cpu function is no-oped -- while it still gets called, it does not set the rcu CPU map. With a blank RCU CPU map, grace periods get to tick by completely oblivious to active RCU read side critical sections. This results in free-before-grace bugs. Fix is obvious once the problem is known. (Also, change __devinit to __cpuinit so the function gets thrown away on !HOTPLUG_CPU kernels). Signed-off-by: Nick Piggin Reported-and-tested-by: Alexey Dobriyan Acked-by: Ingo Molnar Cc: Paul E. 
McKenney [ Nick is my personal hero of the day - Linus ] Signed-off-by: Linus Torvalds --- kernel/rcupreempt.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 5e02b774070..41d275a81df 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -925,26 +925,22 @@ void rcu_offline_cpu(int cpu) spin_unlock_irqrestore(&rdp->lock, flags); } -void __devinit rcu_online_cpu(int cpu) -{ - unsigned long flags; - - spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); - cpu_set(cpu, rcu_cpu_online_map); - spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); -} - #else /* #ifdef CONFIG_HOTPLUG_CPU */ void rcu_offline_cpu(int cpu) { } -void __devinit rcu_online_cpu(int cpu) +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ + +void __cpuinit rcu_online_cpu(int cpu) { -} + unsigned long flags; -#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ + spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); + cpu_set(cpu, rcu_cpu_online_map); + spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); +} static void rcu_process_callbacks(struct softirq_action *unused) { -- cgit v1.2.3 From b1e387348a2a70954312b102d0589c3e2ca3dba1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 10 Jul 2008 11:25:03 -0700 Subject: sched: fix cpu hotplug, cleanup Clean up __migrate_task(): to just have separate "done" and "fail" cases, instead of that "out" case with random error behavior. Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9397b871013..4e2f6033565 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5621,13 +5621,11 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) double_rq_lock(rq_src, rq_dest); /* Already moved. */ - if (task_cpu(p) != src_cpu) { - ret = 1; - goto out; - } + if (task_cpu(p) != src_cpu) + goto done; /* Affinity changed (again). */ if (!cpu_isset(dest_cpu, p->cpus_allowed)) - goto out; + goto fail; on_rq = p->se.on_rq; if (on_rq) @@ -5638,8 +5636,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) activate_task(rq_dest, p, 0); check_preempt_curr(rq_dest, p); } +done: ret = 1; -out: +fail: double_rq_unlock(rq_src, rq_dest); return ret; } -- cgit v1.2.3 From 007c05d4d2ce42fabd58cb54ed98e0a1714d9d86 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2008 20:58:09 -0400 Subject: ftrace: move sched_switch enable after markers We have two markers now that are enabled on sched_switch. One that records the context switching and the other that records task wake ups. Currently we enable the tracing first and then set the markers. This causes some confusing traces: # tracer: sched_switch # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | trace-cmd-3973 [00] 115.834817: 3973:120:R + 3: 0:S trace-cmd-3973 [01] 115.834910: 3973:120:R + 6: 0:S trace-cmd-3973 [02] 115.834910: 3973:120:R + 9: 0:S trace-cmd-3973 [03] 115.834910: 3973:120:R + 12: 0:S trace-cmd-3973 [02] 115.834910: 3973:120:R + 9: 0:S -0 [02] 115.834910: 0:140:R ==> 3973:120:R Here we see that trace-cmd with PID 3973 wakes up task 9 but the next line shows the idle task doing a context switch to task 3973. 
Enabling the tracing to _after_ the markers are set creates a much saner output: # tracer: sched_switch # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | -0 [02] 7922.634225: 0:140:R ==> 4790:120:R trace-cmd-4789 [03] 7922.634225: 0:140:R + 4790:120:R Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/trace/trace_sched_switch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 93a66200915..cb817a209aa 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -227,14 +227,14 @@ void tracing_stop_cmdline_record(void) static void start_sched_trace(struct trace_array *tr) { sched_switch_reset(tr); - tracer_enabled = 1; tracing_start_cmdline_record(); + tracer_enabled = 1; } static void stop_sched_trace(struct trace_array *tr) { - tracing_stop_cmdline_record(); tracer_enabled = 0; + tracing_stop_cmdline_record(); } static void sched_switch_trace_init(struct trace_array *tr) -- cgit v1.2.3 From 001b6767b1d0c89e458e5ddb039245b268f569fb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2008 20:58:10 -0400 Subject: ftrace: define function trace nop When CONFIG_FTRACE is not enabled, the tracing_start_functon_trace and tracing_stop_function_trace should be nops. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6b8bd8800d0..df840f42be3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -216,8 +216,6 @@ void trace_function(struct trace_array *tr, unsigned long parent_ip, unsigned long flags); -void tracing_start_function_trace(void); -void tracing_stop_function_trace(void); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); int register_tracer(struct tracer *type); @@ -234,6 +232,14 @@ void update_max_tr_single(struct trace_array *tr, extern cycle_t ftrace_now(int cpu); +#ifdef CONFIG_FTRACE +void tracing_start_function_trace(void); +void tracing_stop_function_trace(void); +#else +# define tracing_start_function_trace() do { } while (0) +# define tracing_stop_function_trace() do { } while (0) +#endif + #ifdef CONFIG_CONTEXT_SWITCH_TRACER typedef void (*tracer_switch_func_t)(void *private, -- cgit v1.2.3 From 1e16c0a081f6c93f04c6af784d6a160955269f91 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2008 20:58:11 -0400 Subject: ftrace: trace schedule After the sched_clock code has been removed from sched.c we can now trace the scheduler. The scheduler has a lot of functions that would be worth tracing. 
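For readers following along, the reason the one-line Makefile change below makes the scheduler traceable is that -pg instrumentation is what ftrace hooks into. A rough conceptual sketch of the mechanism (an illustration, not code from this patch): compiling sched.o with -pg makes gcc emit a profiling call at each function entry, roughly as if every function had been written as:

	extern void mcount(void);		/* ftrace's per-function entry hook */

	void some_scheduler_function(void)
	{
		mcount();	/* call inserted by the compiler under -pg */
		/* ... original function body ... */
	}

Dropping -pg from CFLAGS_REMOVE_sched.o therefore re-enables that instrumentation for kernel/sched.c.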
Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index ca2433e8487..480976275d9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,7 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o -CFLAGS_REMOVE_sched.o = -pg -mno-spe +CFLAGS_REMOVE_sched.o = -mno-spe ifdef CONFIG_FTRACE # Do not trace debug files and internal ftrace files -- cgit v1.2.3 From b5c21b4514b38f450848feb432f7120376d01ffe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2008 20:58:12 -0400 Subject: ftrace: check proper config for preempt type There is no CONFIG_PREEMPT_DESKTOP. Use the proper entry CONFIG_PREEMPT. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9ade79369bf..f8fdb9cedc2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1341,7 +1341,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) "server", #elif defined(CONFIG_PREEMPT_VOLUNTARY) "desktop", -#elif defined(CONFIG_PREEMPT_DESKTOP) +#elif defined(CONFIG_PREEMPT) "preempt", #else "unknown", -- cgit v1.2.3 From ad591240ceadcaf41b2a88855ca5f1c77c5a0298 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2008 20:58:13 -0400 Subject: ftrace: start wakeup tracing after setting function tracer Enabling the wakeup tracer before enabling the function tracing causes some strange results due to the dynamic enabling of the functions. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/trace/trace_sched_wakeup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index bf7e91caef5..3c8d61df447 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -352,9 +352,10 @@ static void start_wakeup_tracer(struct trace_array *tr) */ smp_wmb(); - tracer_enabled = 1; register_ftrace_function(&trace_ops); + tracer_enabled = 1; + return; fail_deprobe_wake_new: marker_probe_unregister("kernel_sched_wakeup_new", -- cgit v1.2.3 From 26bc83f4cb911a0b4dabfe23b700aaf3235f2955 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2008 20:58:14 -0400 Subject: ftrace: use current CPU for function startup This is more of a clean up. Currently the function tracer initializes the tracer with which ever CPU was last used for tracing. This value isn't realy useful for function tracing, but at least it should be something other than a random number. 
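The diff below relies on the standard get_cpu()/put_cpu() pairing to keep the recorded CPU id meaningful. A minimal usage sketch (a hypothetical helper for illustration, not part of the patch):

	static int sample_current_cpu(void)
	{
		int cpu = get_cpu();	/* disables preemption, returns this CPU's id */

		/* the task cannot migrate here, so 'cpu' stays valid */

		put_cpu();		/* re-enables preemption */
		return cpu;
	}

Without the get_cpu()/put_cpu() bracket, the caller could be migrated right after reading the processor id, leaving a stale value in tr->cpu.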
Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 7ee7dcd76b7..31214489797 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -28,7 +28,10 @@ static void function_reset(struct trace_array *tr) static void start_function_trace(struct trace_array *tr) { + tr->cpu = get_cpu(); function_reset(tr); + put_cpu(); + tracing_start_cmdline_record(); tracing_start_function_trace(); } -- cgit v1.2.3 From a2bb6a3d85ef3124cd336403a95abc0540d3fbe2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2008 20:58:15 -0400 Subject: ftrace: add ftrace_kill_atomic It has been suggested that I add a way to disable the function tracer on an oops. This code adds a ftrace_kill_atomic. It is not meant to be used in normal situations. It will disable the ftrace tracer, but will not perform the nice shutdown that requires scheduling. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0f271c45cd0..1359632668a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1601,6 +1601,21 @@ core_initcall(ftrace_dynamic_init); # define ftrace_force_shutdown() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ +/** + * ftrace_kill_atomic - kill ftrace from critical sections + * + * This function should be used by panic code. It stops ftrace + * but in a not so nice way. If you need to simply kill ftrace + * from a non-atomic section, use ftrace_kill. + */ +void ftrace_kill_atomic(void) +{ + ftrace_disabled = 1; + ftrace_enabled = 0; + ftraced_suspend = -1; + clear_ftrace_function(); +} + /** * ftrace_kill - totally shutdown ftrace * -- cgit v1.2.3 From 60bc080090e3bf6afa29c62cb25f913706551010 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2008 20:58:16 -0400 Subject: ftrace: separate out the function enabled variable Currently the function tracer uses the global tracer_enabled variable that is used to keep track if the tracer is enabled or not. The function tracing startup needs to be separated out, otherwise the internal happenings of the tracer startup is also recorded. This patch creates a ftrace_function_enabled variable to all the starting of the function traces to happen after everything has been started. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f8fdb9cedc2..2e37857f7df 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -96,6 +96,9 @@ static DEFINE_PER_CPU(struct trace_array_cpu, max_data); /* tracer_enabled is used to toggle activation of a tracer */ static int tracer_enabled = 1; +/* function tracing enabled */ +int ftrace_function_enabled; + /* * trace_nr_entries is the number of entries that is allocated * for a buffer. 
Note, the number of entries is always rounded @@ -134,6 +137,7 @@ static notrace void no_trace_init(struct trace_array *tr) { int cpu; + ftrace_function_enabled = 0; if(tr->ctrl) for_each_online_cpu(cpu) tracing_reset(tr->data[cpu]); @@ -985,7 +989,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) long disabled; int cpu; - if (unlikely(!tracer_enabled)) + if (unlikely(!ftrace_function_enabled)) return; if (skip_trace(ip)) @@ -1010,11 +1014,15 @@ static struct ftrace_ops trace_ops __read_mostly = void tracing_start_function_trace(void) { + ftrace_function_enabled = 0; register_ftrace_function(&trace_ops); + if (tracer_enabled) + ftrace_function_enabled = 1; } void tracing_stop_function_trace(void) { + ftrace_function_enabled = 0; unregister_ftrace_function(&trace_ops); } #endif @@ -1850,8 +1858,10 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) m->private = iter; /* stop the trace while dumping */ - if (iter->tr->ctrl) + if (iter->tr->ctrl) { tracer_enabled = 0; + ftrace_function_enabled = 0; + } if (iter->trace && iter->trace->open) iter->trace->open(iter); @@ -1884,8 +1894,14 @@ int tracing_release(struct inode *inode, struct file *file) iter->trace->close(iter); /* reenable tracing if it was previously enabled */ - if (iter->tr->ctrl) + if (iter->tr->ctrl) { tracer_enabled = 1; + /* + * It is safe to enable function tracing even if it + * isn't used + */ + ftrace_function_enabled = 1; + } mutex_unlock(&trace_types_lock); seq_release(inode, file); -- cgit v1.2.3 From 62c43dd9864dbd52ff158922d1d08c75f20335af Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Jul 2008 14:16:50 -0400 Subject: sched_clock: record from last tick The sched_clock code tries to keep within the gtod time by one tick (jiffy). The current code mistakenly keeps track of the delta jiffies between updates of the clock, where the the delta is used to compare with the number of jiffies that have past since an update of the gtod. The gtod is updated at each schedule tick not each sched_clock update. After one jiffy passes the clock is updated fine. But the delta is taken from the last update so if the next update happens before the next tick the delta jiffies used will be incorrect. This patch changes the code to check the delta of jiffies between ticks and not updates to match the comparison of the updates with the gtod. 
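Put differently, the clamp window has to be anchored at the last scheduler tick, since tick_gtod is only refreshed there. A condensed sketch of the corrected bookkeeping (paraphrasing the diff below, using the field names from kernel/sched_clock.c):

	/* jiffies elapsed since the last *tick*, not since the last update */
	long delta_jiffies = jiffies - scd->tick_jiffies;

	/* tick_gtod was sampled at that same tick, so the two now agree */
	u64 min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;

If the delta were measured from the previous sched_clock update instead, any update that ran between two ticks would shrink the window even though tick_gtod had not moved.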
Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index ce05271219a..e383bc7df6d 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -40,7 +40,7 @@ struct sched_clock_data { */ raw_spinlock_t lock; - unsigned long prev_jiffies; + unsigned long tick_jiffies; u64 prev_raw; u64 tick_raw; u64 tick_gtod; @@ -71,7 +71,7 @@ void sched_clock_init(void) struct sched_clock_data *scd = cpu_sdc(cpu); scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - scd->prev_jiffies = now_jiffies; + scd->tick_jiffies = now_jiffies; scd->prev_raw = 0; scd->tick_raw = 0; scd->tick_gtod = ktime_now; @@ -90,7 +90,7 @@ void sched_clock_init(void) static void __update_sched_clock(struct sched_clock_data *scd, u64 now) { unsigned long now_jiffies = jiffies; - long delta_jiffies = now_jiffies - scd->prev_jiffies; + long delta_jiffies = now_jiffies - scd->tick_jiffies; u64 clock = scd->clock; u64 min_clock, max_clock; s64 delta = now - scd->prev_raw; @@ -119,7 +119,6 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) clock = min_clock; scd->prev_raw = now; - scd->prev_jiffies = now_jiffies; scd->clock = clock; } @@ -179,6 +178,7 @@ u64 sched_clock_cpu(int cpu) void sched_clock_tick(void) { struct sched_clock_data *scd = this_scd(); + unsigned long now_jiffies = jiffies; u64 now, now_gtod; if (unlikely(!sched_clock_running)) @@ -196,6 +196,7 @@ void sched_clock_tick(void) * already observe 1 new jiffy; adding a new tick_gtod to that would * increase the clock 2 jiffies. */ + scd->tick_jiffies = now_jiffies; scd->tick_raw = now; scd->tick_gtod = now_gtod; __raw_spin_unlock(&scd->lock); -- cgit v1.2.3 From f7cce27f5605b9e137b829a47949cb2d3c7e1cab Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Jul 2008 14:16:51 -0400 Subject: sched_clock: widen the max and min time With keeping the max and min sched time within one jiffy of the gtod clock was too tight. Just before a schedule tick the max could easily be hit, as well as just after a schedule_tick the min could be hit. This caused the clock to jump around by a jiffy. This patch widens the minimum to last gtod + (delta_jiffies ? delta_jiffies - 1 : 0) * TICK_NSECS and the maximum to last gtod + (2 + delta_jiffies) * TICK_NSECS This keeps the minum to gtod or if one jiffy less than delta jiffies and the maxim 2 jiffies ahead of gtod. This may cause unstable TSCs to be a bit more sporadic, but it helps keep a clock with a stable TSC working well. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index e383bc7df6d..42b81fa38cb 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -96,14 +96,21 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) s64 delta = now - scd->prev_raw; WARN_ON_ONCE(!irqs_disabled()); - min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; + + min_clock = scd->tick_gtod + + (delta_jiffies ? delta_jiffies - 1 : 0) * TICK_NSEC; if (unlikely(delta < 0)) { clock++; goto out; } - max_clock = min_clock + TICK_NSEC; + /* + * The clock must stay within a jiffie of the gtod. 
+ * But since we may be at the start of a jiffy or the end of one + * we add another jiffy buffer. + */ + max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; if (unlikely(clock + delta > max_clock)) { if (clock < max_clock) -- cgit v1.2.3 From af52a90a14cdaa54ecbfb6e6982abb13466a4b56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Jul 2008 14:16:52 -0400 Subject: sched_clock: stop maximum check on NO HZ Working with ftrace I would get large jumps of 11 millisecs or more with the clock tracer. This killed the latencing timings of ftrace and also caused the irqoff self tests to fail. What was happening is with NO_HZ the idle would stop the jiffy counter and before the jiffy counter was updated the sched_clock would have a bad delta jiffies to compare with the gtod with the maximum. The jiffies would stop and the last sched_tick would record the last gtod. On wakeup, the sched clock update would compare the gtod + delta jiffies (which would be zero) and compare it to the TSC. The TSC would have correctly (with a stable TSC) moved forward several jiffies. But because the jiffies has not been updated yet the clock would be prevented from moving forward because it would appear that the TSC jumped too far ahead. The clock would then virtually stop, until the jiffies are updated. Then the next sched clock update would see that the clock was very much behind since the delta jiffies is now correct. This would then jump the clock forward by several jiffies. This caused ftrace to report several milliseconds of interrupts off latency at every resume from NO_HZ idle. This patch adds hooks into the nohz code to disable the checking of the maximum clock update when nohz is in effect. It resumes the max check when nohz has updated the jiffies again. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 39 ++++++++++++++++++++++++++++++++++++++- kernel/time/tick-sched.c | 2 ++ 2 files changed, 40 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 42b81fa38cb..97159e225a7 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -45,6 +45,9 @@ struct sched_clock_data { u64 tick_raw; u64 tick_gtod; u64 clock; +#ifdef CONFIG_NO_HZ + int check_max; +#endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); @@ -76,11 +79,45 @@ void sched_clock_init(void) scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; +#ifdef CONFIG_NO_HZ + scd->check_max = 1; +#endif } sched_clock_running = 1; } +#ifdef CONFIG_NO_HZ +/* + * The dynamic ticks makes the delta jiffies inaccurate. This + * prevents us from checking the maximum time update. + * Disable the maximum check during stopped ticks. 
+ */ +void sched_clock_tick_stop(int cpu) +{ + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->check_max = 0; +} + +void sched_clock_tick_start(int cpu) +{ + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->check_max = 1; +} + +static int check_max(struct sched_clock_data *scd) +{ + return scd->check_max; +} +#else +static int check_max(struct sched_clock_data *scd) +{ + return 1; +} +#endif /* CONFIG_NO_HZ */ + /* * update the percpu scd from the raw @now value * @@ -112,7 +149,7 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) */ max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; - if (unlikely(clock + delta > max_clock)) { + if (unlikely(clock + delta > max_clock) && check_max(scd)) { if (clock < max_clock) clock = max_clock; else diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b854a895591..d63008b09a4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -276,6 +276,7 @@ void tick_nohz_stop_sched_tick(void) ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; rcu_enter_nohz(); + sched_clock_tick_stop(cpu); } /* @@ -375,6 +376,7 @@ void tick_nohz_restart_sched_tick(void) select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); + sched_clock_tick_start(cpu); cpu_clear(cpu, nohz_cpu_mask); /* -- cgit v1.2.3 From 2b8a0cf4890d7537a77b51caa8f508e4a05a0e67 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Jul 2008 19:49:41 -0400 Subject: sched_clock: fix calculation of other CPU The algorithm to calculate the 'now' of another CPU is not correct. At each scheduler tick, each CPU records the last sched_clock and gtod (tick_raw and tick_gtod respectively). If the TSC is somewhat the same in speed between two clocks the algorithm would be: tick_gtod1 + (now1 - tick_raw1) = tick_gtod2 + (now2 - tick_raw2) To calculate now2 we would have: now2 = (tick_gtod1 - tick_gtod2) + (tick_raw2 - tick_raw1) + now1 Currently the algorithm is: now2 = (tick_gtod1 - tick_gtod2) + (tick_raw1 - tick_raw2) + now1 This solves most of the rest of the issues I've had with timestamps in ftace. Signed-off-by: Steven Rostedt Cc: Andrew Morton Cc: john stultz Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 97159e225a7..55fca1e9e12 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -203,8 +203,8 @@ u64 sched_clock_cpu(int cpu) now -= my_scd->tick_raw; now += scd->tick_raw; - now -= my_scd->tick_gtod; - now += scd->tick_gtod; + now += my_scd->tick_gtod; + now -= scd->tick_gtod; __raw_spin_unlock(&my_scd->lock); } else { -- cgit v1.2.3 From c0c87734f125d2fa8ebc70310f3257fa6209f2b6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Jul 2008 00:15:31 -0400 Subject: sched_clock: only update deltas with local reads. Reading the CPU clock should try to stay accurate within the CPU. By reading the CPU clock from another CPU and updating the deltas can cause unneeded jumps when reading from the local CPU. This patch changes the code to update the last read TSC only when read from the local CPU. 
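The key asymmetry: a cross-CPU read should compute a clock value without writing anything back, while only the CPU that owns the data updates prev_raw and clock. A condensed sketch of that split (paraphrasing the diff below):

	if (cpu != smp_processor_id()) {
		/* remote read: result returned via 'clock', scd left untouched */
		__update_sched_clock(scd, now, &clock);
	} else {
		/* local read: scd->prev_raw and scd->clock are updated as before */
		__update_sched_clock(scd, now, NULL);
		clock = scd->clock;
	}

Keeping remote readers from advancing another CPU's prev_raw is what removes the unneeded jumps described above.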
Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: john stultz Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 55fca1e9e12..ee7cce5029c 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -124,7 +124,7 @@ static int check_max(struct sched_clock_data *scd) * - filter out backward motion * - use jiffies to generate a min,max window to clip the raw values */ -static void __update_sched_clock(struct sched_clock_data *scd, u64 now) +static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time) { unsigned long now_jiffies = jiffies; long delta_jiffies = now_jiffies - scd->tick_jiffies; @@ -162,8 +162,12 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) if (unlikely(clock < min_clock)) clock = min_clock; - scd->prev_raw = now; - scd->clock = clock; + if (time) + *time = clock; + else { + scd->prev_raw = now; + scd->clock = clock; + } } static void lock_double_clock(struct sched_clock_data *data1, @@ -207,15 +211,18 @@ u64 sched_clock_cpu(int cpu) now -= scd->tick_gtod; __raw_spin_unlock(&my_scd->lock); + + __update_sched_clock(scd, now, &clock); + + __raw_spin_unlock(&scd->lock); + } else { __raw_spin_lock(&scd->lock); + __update_sched_clock(scd, now, NULL); + clock = scd->clock; + __raw_spin_unlock(&scd->lock); } - __update_sched_clock(scd, now); - clock = scd->clock; - - __raw_spin_unlock(&scd->lock); - return clock; } @@ -234,7 +241,7 @@ void sched_clock_tick(void) now_gtod = ktime_to_ns(ktime_get()); __raw_spin_lock(&scd->lock); - __update_sched_clock(scd, now); + __update_sched_clock(scd, now, NULL); /* * update tick_gtod after __update_sched_clock() because that will * already observe 1 new jiffy; adding a new tick_gtod to that would -- cgit v1.2.3 From a83bc47c33ab182f1e48977fd5a04024d713c75e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Jul 2008 00:15:32 -0400 Subject: sched_clock: record TSC after gtod To read the gtod we need to grab the xtime lock for read. Reading the gtod before the TSC can cause a bigger gab if the xtime lock is contended. This patch simply reverses the order to read the TSC after the gtod. The locking in the reading of the gtod handles any barriers one might think is needed. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: john stultz Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index ee7cce5029c..28ff6bf5e02 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -237,8 +237,8 @@ void sched_clock_tick(void) WARN_ON_ONCE(!irqs_disabled()); - now = sched_clock(); now_gtod = ktime_to_ns(ktime_get()); + now = sched_clock(); __raw_spin_lock(&scd->lock); __update_sched_clock(scd, now, NULL); -- cgit v1.2.3 From c300ba252829e9325e08f0af60687add94445b25 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Jul 2008 00:15:33 -0400 Subject: sched_clock: and multiplier for TSC to gtod drift The sched_clock code currently tries to keep all CPU clocks of all CPUS somewhat in sync. At every clock tick it records the gtod clock and uses that and jiffies and the TSC to calculate a CPU clock that tries to stay in sync with all the other CPUs. ftrace depends heavily on this timer and it detects when this timer "jumps". 
One problem is that the TSC and the gtod also drift. When the TSC is 0.1% faster or slower than the gtod it is very noticeable in ftrace. To help compensate for this, I've added a multiplier that tries to keep the CPU clock updating at the same rate as the gtod. I've tried various ways to get it to be in sync and this ended up being the most reliable. At every scheduler tick we calculate the new multiplier: multi = delta_gtod / delta_TSC This means we perform a 64 bit divide at the tick (once a HZ). A shift is used to handle the accuracy. Other methods that failed due to dynamic HZ are: (not used) multi += (gtod - tsc) / delta_gtod (not used) multi += (gtod - (last_tsc + delta_tsc)) / delta_gtod as well as other variants. This code still allows for a slight drift between TSC and gtod, but it keeps the damage down to a minimum. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: john stultz Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 28ff6bf5e02..8affbfd0cdb 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -3,6 +3,9 @@ * * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra * + * Updates and enhancements: + * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt + * * Based on code by: * Ingo Molnar * Guillaume Chazarain @@ -32,6 +35,11 @@ #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +#define MULTI_SHIFT 15 +/* Max is double, Min is 1/2 */ +#define MAX_MULTI (2LL << MULTI_SHIFT) +#define MIN_MULTI (1LL << (MULTI_SHIFT-1)) + struct sched_clock_data { /* * Raw spinlock - this is a special case: this might be called @@ -45,6 +53,7 @@ struct sched_clock_data { u64 tick_raw; u64 tick_gtod; u64 clock; + s64 multi; #ifdef CONFIG_NO_HZ int check_max; #endif @@ -79,6 +88,7 @@ void sched_clock_init(void) scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; + scd->multi = 1 << MULTI_SHIFT; #ifdef CONFIG_NO_HZ scd->check_max = 1; #endif @@ -134,8 +144,13 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim WARN_ON_ONCE(!irqs_disabled()); - min_clock = scd->tick_gtod + - (delta_jiffies ? delta_jiffies - 1 : 0) * TICK_NSEC; + /* + * At schedule tick the clock can be just under the gtod. We don't + * want to push it too prematurely. + */ + min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC); + if (min_clock > TICK_NSEC) + min_clock -= TICK_NSEC / 2; if (unlikely(delta < 0)) { clock++; @@ -149,6 +164,9 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim */ max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; + delta *= scd->multi; + delta >>= MULTI_SHIFT; + if (unlikely(clock + delta > max_clock) && check_max(scd)) { if (clock < max_clock) clock = max_clock; @@ -230,6 +248,7 @@ void sched_clock_tick(void) { struct sched_clock_data *scd = this_scd(); unsigned long now_jiffies = jiffies; + s64 mult, delta_gtod, delta_raw; u64 now, now_gtod; if (unlikely(!sched_clock_running)) @@ -247,9 +266,23 @@ void sched_clock_tick(void) * already observe 1 new jiffy; adding a new tick_gtod to that would * increase the clock 2 jiffies. 
*/ - scd->tick_jiffies = now_jiffies; + delta_gtod = now_gtod - scd->tick_gtod; + delta_raw = now - scd->tick_raw; + + if ((long)delta_raw > 0) { + mult = delta_gtod << MULTI_SHIFT; + do_div(mult, delta_raw); + scd->multi = mult; + if (scd->multi > MAX_MULTI) + scd->multi = MAX_MULTI; + else if (scd->multi < MIN_MULTI) + scd->multi = MIN_MULTI; + } else + scd->multi = 1 << MULTI_SHIFT; + scd->tick_raw = now; scd->tick_gtod = now_gtod; + scd->tick_jiffies = now_jiffies; __raw_spin_unlock(&scd->lock); } @@ -279,6 +312,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) __raw_spin_lock(&scd->lock); scd->prev_raw = now; scd->clock += delta_ns; + scd->multi = 1 << MULTI_SHIFT; __raw_spin_unlock(&scd->lock); touch_softlockup_watchdog(); -- cgit v1.2.3 From b2613e370dbeb69edbff989382fa54f2395aa471 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Jul 2008 16:44:27 +0200 Subject: ftrace: build fix for ftraced_suspend fix: kernel/trace/ftrace.c:1615: error: 'ftraced_suspend' undeclared (first use in this function) kernel/trace/ftrace.c:1615: error: (Each undeclared identifier is reported only once kernel/trace/ftrace.c:1615: error: for each function it appears in.) Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1359632668a..4231a3dc224 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1612,7 +1612,9 @@ void ftrace_kill_atomic(void) { ftrace_disabled = 1; ftrace_enabled = 0; +#ifdef CONFIG_DYNAMIC_FTRACE ftraced_suspend = -1; +#endif clear_ftrace_function(); } -- cgit v1.2.3 From 3e84050c81ffb4961ef43d20e1fb1d7607167d83 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Sun, 13 Jul 2008 02:10:29 +0200 Subject: cpusets, hotplug, scheduler: fix scheduler domain breakage Commit f18f982ab ("sched: CPU hotplug events must not destroy scheduler domains created by the cpusets") introduced a hotplug-related problem as described below: Upon CPU_DOWN_PREPARE, update_sched_domains() -> detach_destroy_domains(&cpu_online_map) does the following: /* * Force a reinitialization of the sched domains hierarchy. The domains * and groups cannot be updated in place without racing with the balancing * code, so we temporarily attach all running cpus to the NULL domain * which will prevent rebalancing while the sched domains are recalculated. */ The sched-domains should be rebuilt when a CPU_DOWN ops. has been completed, effectively either upon CPU_DEAD{_FROZEN} (upon success) or CPU_DOWN_FAILED{_FROZEN} (upon failure -- restore the things to their initial state). That's what update_sched_domains() also does but only for !CPUSETS case. With f18f982ab, sched-domains' reinitialization is delegated to CPUSETS code: cpuset_handle_cpuhp() -> common_cpu_mem_hotplug_unplug() -> rebuild_sched_domains() Being called for CPU_UP_PREPARE and if its callback is called after update_sched_domains()), it just negates all the work done by update_sched_domains() -- i.e. a soon-to-be-offline cpu is included in the sched-domains and that makes it visible for the load-balancer while the CPU_DOWN ops. is in progress. __migrate_live_tasks() moves the tasks off a 'dead' cpu (it's already "offline" when this function is called). try_to_wake_up() is called for one of these tasks from another CPU -> the load-balancer (wake_idle()) picks up a "dead" CPU and places the task on it. Then e.g. BUG_ON(rq->nr_running) detects this a bit later -> oops. 
Signed-off-by: Dmitry Adamushko Tested-by: Vegard Nossum Cc: Paul Menage Cc: Max Krasnyansky Cc: Paul Jackson Cc: Peter Zijlstra Cc: miaox@cn.fujitsu.com Cc: rostedt@goodmis.org Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9fceb97e989..798b3ab054e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1882,7 +1882,7 @@ static void scan_for_empty_cpusets(const struct cpuset *root) * in order to minimize text size. */ -static void common_cpu_mem_hotplug_unplug(void) +static void common_cpu_mem_hotplug_unplug(int rebuild_sd) { cgroup_lock(); @@ -1894,7 +1894,8 @@ static void common_cpu_mem_hotplug_unplug(void) * Scheduler destroys domains on hotplug events. * Rebuild them based on the current settings. */ - rebuild_sched_domains(); + if (rebuild_sd) + rebuild_sched_domains(); cgroup_unlock(); } @@ -1912,11 +1913,22 @@ static void common_cpu_mem_hotplug_unplug(void) static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, unsigned long phase, void *unused_cpu) { - if (phase == CPU_DYING || phase == CPU_DYING_FROZEN) + switch (phase) { + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + common_cpu_mem_hotplug_unplug(1); + break; + default: return NOTIFY_DONE; + } - common_cpu_mem_hotplug_unplug(); - return 0; + return NOTIFY_OK; } #ifdef CONFIG_MEMORY_HOTPLUG @@ -1929,7 +1941,7 @@ static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, void cpuset_track_online_nodes(void) { - common_cpu_mem_hotplug_unplug(); + common_cpu_mem_hotplug_unplug(0); } #endif -- cgit v1.2.3 From 199a952876adbfc2b6c13b8b07adabebf4ff54b2 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 26 Jun 2008 10:06:43 +0800 Subject: rcu classic: update qlen when cpu offline When callbacks are moved from offline cpu to this cpu, the qlen field of this rdp should be updated. [ Paul E. McKenney: ] The effect of this bug would be for force_quiescent_state() to be invoked when it should not and vice versa -- wasting cycles in the first case and letting RCU callbacks remain piled up in the second case. The bug is thus "benign" in that it does not result in premature grace-period termination, but should of course be fixed nonetheless. Preemption is disabled by the caller's get_cpu_var(), so we are guaranteed to remain on the same CPU, as required. The local_irq_disable() is indeed needed, otherwise, an interrupt might invoke call_rcu() or call_rcu_bh(), which could cause that interrupt's increment of ->qlen to be lost. Signed-off-by: Lai Jiangshan Cc: Andrew Morton Reviewed-by: Paul E. 
McKenney Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 214e1cde981..529190c485f 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -387,6 +387,10 @@ static void __rcu_offline_cpu(struct rcu_data *this_rdp, rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); + + local_irq_disable(); + this_rdp->qlen += rdp->qlen; + local_irq_enable(); } static void rcu_offline_cpu(int cpu) -- cgit v1.2.3 From 006ebb40d3d65338bd74abb03b945f8d60e362bd Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Mon, 19 May 2008 08:32:49 -0400 Subject: Security: split proc ptrace checking into read vs. attach Enable security modules to distinguish reading of process state via proc from full ptrace access by renaming ptrace_may_attach to ptrace_may_access and adding a mode argument indicating whether only read access or full attach access is requested. This allows security modules to permit access to reading process state without granting full ptrace access. The base DAC/capability checking remains unchanged. Read access to /proc/pid/mem continues to apply a full ptrace attach check since check_mem_permission() already requires the current task to already be ptracing the target. The other ptrace checks within proc for elements like environ, maps, and fds are changed to pass the read mode instead of attach. In the SELinux case, we model such reading of process state as a reading of a proc file labeled with the target process' label. This enables SELinux policy to permit such reading of process state without permitting control or manipulation of the target process, as there are a number of cases where programs probe for such information via proc but do not need to be able to control the target (e.g. procps, lsof, PolicyKit, ConsoleKit). At present we have to choose between allowing full ptrace in policy (more permissive than required/desired) or breaking functionality (or in some cases just silencing the denials via dontaudit rules but this can hide genuine attacks). This version of the patch incorporates comments from Casey Schaufler (change/replace existing ptrace_may_attach interface, pass access mode), and Chris Wright (provide greater consistency in the checking). Note that like their predecessors __ptrace_may_attach and ptrace_may_attach, the __ptrace_may_access and ptrace_may_access interfaces use different return value conventions from each other (0 or -errno vs. 1 or 0). I retained this difference to avoid any changes to the caller logic but made the difference clearer by changing the latter interface to return a bool rather than an int and by adding a comment about it to ptrace.h for any future callers. Signed-off-by: Stephen Smalley Acked-by: Chris Wright Signed-off-by: James Morris --- kernel/ptrace.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6c19e94fd0a..e337390fce0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -121,7 +121,7 @@ int ptrace_check_attach(struct task_struct *child, int kill) return ret; } -int __ptrace_may_attach(struct task_struct *task) +int __ptrace_may_access(struct task_struct *task, unsigned int mode) { /* May we inspect the given task? 
* This check is used both for attaching with ptrace @@ -148,16 +148,16 @@ int __ptrace_may_attach(struct task_struct *task) if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; - return security_ptrace(current, task); + return security_ptrace(current, task, mode); } -int ptrace_may_attach(struct task_struct *task) +bool ptrace_may_access(struct task_struct *task, unsigned int mode) { int err; task_lock(task); - err = __ptrace_may_attach(task); + err = __ptrace_may_access(task, mode); task_unlock(task); - return !err; + return (!err ? true : false); } int ptrace_attach(struct task_struct *task) @@ -195,7 +195,7 @@ repeat: /* the same process cannot be attached many times */ if (task->ptrace & PT_PTRACED) goto bad; - retval = __ptrace_may_attach(task); + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); if (retval) goto bad; @@ -494,7 +494,8 @@ int ptrace_traceme(void) */ task_lock(current); if (!(current->ptrace & PT_PTRACED)) { - ret = security_ptrace(current->parent, current); + ret = security_ptrace(current->parent, current, + PTRACE_MODE_ATTACH); /* * Set the ptrace bit in the process ptrace flags. */ -- cgit v1.2.3 From 992860e991f2015fb8c8df65aa32afa0dcbb4430 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 14 Jul 2008 10:28:38 +0200 Subject: lockdep: fix ftrace irq tracing false positive fix this false positive: [ 0.020000] ------------[ cut here ]------------ [ 0.020000] WARNING: at kernel/lockdep.c:2718 check_flags+0x14a/0x170() [ 0.020000] Modules linked in: [ 0.020000] Pid: 0, comm: swapper Not tainted 2.6.26-tip-00343-gd7e5521-dirty #14486 [ 0.020000] [] warn_on_slowpath+0x54/0x80 [ 0.020000] [] ? _spin_unlock_irqrestore+0x61/0x70 [ 0.020000] [] ? release_console_sem+0x201/0x210 [ 0.020000] [] ? __kernel_text_address+0x35/0x40 [ 0.020000] [] ? dump_trace+0x5e/0x140 [ 0.020000] [] ? __lock_acquire+0x245/0x820 [ 0.020000] [] check_flags+0x14a/0x170 [ 0.020000] [] ? lock_acquire+0x48/0xc0 [ 0.020000] [] lock_acquire+0x51/0xc0 [ 0.020000] [] ? down+0x2c/0x40 [ 0.020000] [] ? sched_clock+0x9/0x10 [ 0.020000] [] _write_lock+0x32/0x60 [ 0.020000] [] ? request_resource+0x1f/0xb0 [ 0.020000] [] request_resource+0x1f/0xb0 [ 0.020000] [] vgacon_startup+0x2bd/0x3e0 [ 0.020000] [] con_init+0x19/0x22f [ 0.020000] [] ? tty_register_ldisc+0x5c/0x70 [ 0.020000] [] console_init+0x20/0x2e [ 0.020000] [] start_kernel+0x20c/0x379 [ 0.020000] [] ? unknown_bootoption+0x0/0x1f6 [ 0.020000] [] __init_begin+0x99/0xa1 [ 0.020000] ======================= [ 0.020000] ---[ end trace 4eaa2a86a8e2da22 ]--- [ 0.020000] possible reason: unannotated irqs-on. [ 0.020000] irq event stamp: 0 which occurs if CONFIG_TRACE_IRQFLAGS=y, CONFIG_DEBUG_LOCKDEP=y, but CONFIG_PROVE_LOCKING is disabled. 
Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7553a28b99c..fc5d5aabd77 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2688,7 +2688,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) */ static void check_flags(unsigned long flags) { -#if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS) +#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \ + defined(CONFIG_TRACE_IRQFLAGS) if (!debug_locks) return; -- cgit v1.2.3 From d12c1a37925a8ec386994169605fe99217295199 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 14 Jul 2008 12:09:28 +0200 Subject: lockdep: fix kernel/fork.c warning fix: [ 0.184011] ------------[ cut here ]------------ [ 0.188011] WARNING: at kernel/fork.c:918 copy_process+0x1c0/0x1084() [ 0.192011] Pid: 0, comm: swapper Not tainted 2.6.26-tip-00351-g01d4a50-dirty #14521 [ 0.196011] [] warn_on_slowpath+0x3c/0x60 [ 0.200012] [] ? __alloc_pages_internal+0x92/0x36b [ 0.208012] [] ? __spin_lock_init+0x24/0x4a [ 0.212012] [] copy_process+0x1c0/0x1084 [ 0.216013] [] do_fork+0xb8/0x1ad [ 0.220013] [] ? acpi_os_release_lock+0x8/0xa [ 0.228013] [] ? acpi_os_vprintf+0x20/0x24 [ 0.232014] [] kernel_thread+0x75/0x7d [ 0.236014] [] ? kernel_init+0x0/0x24a [ 0.240014] [] ? kernel_init+0x0/0x24a [ 0.244014] [] ? kernel_thread_helper+0x0/0x10 [ 0.252015] [] rest_init+0x14/0x50 [ 0.256015] [] start_kernel+0x2b9/0x2c0 [ 0.260015] [] __init_begin+0x4f/0x57 [ 0.264016] ======================= [ 0.268016] ---[ end trace 4eaa2a86a8e2da22 ]--- [ 0.272016] enabled ExtINT on CPU#0 which occurs if CONFIG_TRACE_IRQFLAGS=y, CONFIG_DEBUG_LOCKDEP=y, but CONFIG_PROVE_LOCKING is disabled. Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 19908b26cf8..cdb1f82d3bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -909,7 +909,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, rt_mutex_init_task(p); -#ifdef CONFIG_TRACE_IRQFLAGS +#ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif -- cgit v1.2.3 From 63cf13b77ab785e87c867defa8545e6d4a989774 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 15 Jul 2008 13:22:49 -0700 Subject: generic ipi function calls: wait on alloc failure fallback When a GFP_ATOMIC allocation fails, it falls back to allocating the data on the stack and converting it to a waiting call. Make sure we actually wait in this case. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Linus Torvalds --- kernel/smp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index ab10793b070..462c785ca1e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -312,6 +312,7 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, if (!data) { data = &d; data->csd.flags = CSD_FLAG_WAIT; + wait = 1; } spin_lock_init(&data->lock); -- cgit v1.2.3 From b62b8ef906cdf7115af579ce7378886ce3e0ce00 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 29 Apr 2008 02:35:56 -0400 Subject: force offline the processor during hot-removal The ACPI device node for the cpu has already been unregistered when acpi_processor_handle_eject is called. Thus we should offline the cpu and continue, rather than a failure here. 
http://bugzilla.kernel.org/show_bug.cgi?id=9772 Signed-off-by: Zhang Rui Signed-off-by: Len Brown Signed-off-by: Andi Kleen --- kernel/cpu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index b11f06dc149..cfb1d43ab80 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -299,6 +299,7 @@ int __ref cpu_down(unsigned int cpu) cpu_maps_update_done(); return err; } +EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ /* Requires cpu_add_remove_lock to be held */ -- cgit v1.2.3 From ebb12db51f6c13b30752fcf506baad4c617b153c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 11 Jun 2008 22:04:29 +0200 Subject: Freezer: Introduce PF_FREEZER_NOSIG The freezer currently attempts to distinguish kernel threads from user space tasks by checking if their mm pointer is unset and it does not send fake signals to kernel threads. However, there are kernel threads, mostly related to networking, that behave like user space tasks and may want to be sent a fake signal to be frozen. Introduce the new process flag PF_FREEZER_NOSIG that will be set by default for all kernel threads and make the freezer only send fake signals to the tasks having PF_FREEZER_NOSIG unset. Provide the set_freezable_with_signal() function to be called by the kernel threads that want to be sent a fake signal for freezing. This patch should not change the freezer's observable behavior. Signed-off-by: Rafael J. Wysocki Signed-off-by: Andi Kleen Acked-by: Pavel Machek Signed-off-by: Len Brown --- kernel/kthread.c | 2 +- kernel/power/process.c | 97 ++++++++++++++++++++++---------------------------- 2 files changed, 43 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 97747cdd37c..ac3fb732664 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -235,7 +235,7 @@ int kthreadd(void *unused) set_user_nice(tsk, KTHREAD_NICE_LEVEL); set_cpus_allowed(tsk, CPU_MASK_ALL); - current->flags |= PF_NOFREEZE; + current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; for (;;) { set_current_state(TASK_INTERRUPTIBLE); diff --git a/kernel/power/process.c b/kernel/power/process.c index f1d0b345c9b..5fb87652f21 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -19,9 +19,6 @@ */ #define TIMEOUT (20 * HZ) -#define FREEZER_KERNEL_THREADS 0 -#define FREEZER_USER_SPACE 1 - static inline int freezeable(struct task_struct * p) { if ((p == current) || @@ -84,63 +81,53 @@ static void fake_signal_wake_up(struct task_struct *p) spin_unlock_irqrestore(&p->sighand->siglock, flags); } -static int has_mm(struct task_struct *p) +static inline bool should_send_signal(struct task_struct *p) { - return (p->mm && !(p->flags & PF_BORROWED_MM)); + return !(p->flags & PF_FREEZER_NOSIG); } /** * freeze_task - send a freeze request to given task * @p: task to send the request to - * @with_mm_only: if set, the request will only be sent if the task has its - * own mm - * Return value: 0, if @with_mm_only is set and the task has no mm of its - * own or the task is frozen, 1, otherwise + * @sig_only: if set, the request will only be sent if the task has the + * PF_FREEZER_NOSIG flag unset + * Return value: 'false', if @sig_only is set and the task has + * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise * - * The freeze request is sent by seting the tasks's TIF_FREEZE flag and + * The freeze request is sent by setting the tasks's TIF_FREEZE flag and * either sending a fake signal to it or waking it up, depending on whether - * or not it has its 
own mm (ie. it is a user land task). If @with_mm_only - * is set and the task has no mm of its own (ie. it is a kernel thread), - * its TIF_FREEZE flag should not be set. - * - * The task_lock() is necessary to prevent races with exit_mm() or - * use_mm()/unuse_mm() from occuring. + * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task + * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its + * TIF_FREEZE flag will not be set. */ -static int freeze_task(struct task_struct *p, int with_mm_only) +static bool freeze_task(struct task_struct *p, bool sig_only) { - int ret = 1; + /* + * We first check if the task is freezing and next if it has already + * been frozen to avoid the race with frozen_process() which first marks + * the task as frozen and next clears its TIF_FREEZE. + */ + if (!freezing(p)) { + rmb(); + if (frozen(p)) + return false; - task_lock(p); - if (freezing(p)) { - if (has_mm(p)) { - if (!signal_pending(p)) - fake_signal_wake_up(p); - } else { - if (with_mm_only) - ret = 0; - else - wake_up_state(p, TASK_INTERRUPTIBLE); - } + if (!sig_only || should_send_signal(p)) + set_freeze_flag(p); + else + return false; + } + + if (should_send_signal(p)) { + if (!signal_pending(p)) + fake_signal_wake_up(p); + } else if (sig_only) { + return false; } else { - rmb(); - if (frozen(p)) { - ret = 0; - } else { - if (has_mm(p)) { - set_freeze_flag(p); - fake_signal_wake_up(p); - } else { - if (with_mm_only) { - ret = 0; - } else { - set_freeze_flag(p); - wake_up_state(p, TASK_INTERRUPTIBLE); - } - } - } + wake_up_state(p, TASK_INTERRUPTIBLE); } - task_unlock(p); - return ret; + + return true; } static void cancel_freezing(struct task_struct *p) @@ -156,7 +143,7 @@ static void cancel_freezing(struct task_struct *p) } } -static int try_to_freeze_tasks(int freeze_user_space) +static int try_to_freeze_tasks(bool sig_only) { struct task_struct *g, *p; unsigned long end_time; @@ -175,7 +162,7 @@ static int try_to_freeze_tasks(int freeze_user_space) if (frozen(p) || !freezeable(p)) continue; - if (!freeze_task(p, freeze_user_space)) + if (!freeze_task(p, sig_only)) continue; /* @@ -235,13 +222,13 @@ int freeze_processes(void) int error; printk("Freezing user space processes ... "); - error = try_to_freeze_tasks(FREEZER_USER_SPACE); + error = try_to_freeze_tasks(true); if (error) goto Exit; printk("done.\n"); printk("Freezing remaining freezable tasks ... "); - error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); + error = try_to_freeze_tasks(false); if (error) goto Exit; printk("done."); @@ -251,7 +238,7 @@ int freeze_processes(void) return error; } -static void thaw_tasks(int thaw_user_space) +static void thaw_tasks(bool nosig_only) { struct task_struct *g, *p; @@ -260,7 +247,7 @@ static void thaw_tasks(int thaw_user_space) if (!freezeable(p)) continue; - if (!p->mm == thaw_user_space) + if (nosig_only && should_send_signal(p)) continue; thaw_process(p); @@ -271,8 +258,8 @@ static void thaw_tasks(int thaw_user_space) void thaw_processes(void) { printk("Restarting tasks ... "); - thaw_tasks(FREEZER_KERNEL_THREADS); - thaw_tasks(FREEZER_USER_SPACE); + thaw_tasks(true); + thaw_tasks(false); schedule(); printk("done.\n"); } -- cgit v1.2.3 From 52d11025dba32bed696eaee1822b26529e764770 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 11 Jun 2008 22:07:52 +0200 Subject: snapshot: Push BKL down into ioctl handlers Push BKL down into ioctl handlers - snapshot device. Signed-off-by: Alan Cox Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Len Brown Signed-off-by: Andi Kleen --- kernel/power/user.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index f5512cb3aa8..658262b1599 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -164,8 +165,8 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, return res; } -static int snapshot_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +static long snapshot_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) { int error = 0; struct snapshot_data *data; @@ -181,6 +182,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, data = filp->private_data; + lock_kernel(); + switch (cmd) { case SNAPSHOT_FREEZE: @@ -389,7 +392,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -ENOTTY; } - + unlock_kernel(); return error; } @@ -399,7 +402,7 @@ static const struct file_operations snapshot_fops = { .read = snapshot_read, .write = snapshot_write, .llseek = no_llseek, - .ioctl = snapshot_ioctl, + .unlocked_ioctl = snapshot_ioctl, }; static struct miscdevice snapshot_device = { -- cgit v1.2.3 From 25f2f3daadaf0768a61d02ee3ed3d9a21e9dc46c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 11 Jun 2008 22:09:45 +0200 Subject: snapshot: Use pm_mutex for mutual exclusion We can avoid taking the BKL in snapshot_ioctl() if pm_mutex is used to prevent the ioctls from being executed concurrently. In addition, although it is only possible to open /dev/snapshot once, the task which has done that may spawn a child that will inherit the open descriptor, so in theory they can call snapshot_write(), snapshot_read() and snapshot_release() concurrently. pm_mutex can also be used for mutual exclusion in such cases. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Andi Kleen Acked-by: Pavel Machek Signed-off-by: Len Brown --- kernel/power/user.c | 68 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index 658262b1599..a6332a31326 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -70,16 +70,22 @@ static int snapshot_open(struct inode *inode, struct file *filp) struct snapshot_data *data; int error; - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) - return -EBUSY; + mutex_lock(&pm_mutex); + + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + goto Unlock; + } if ((filp->f_flags & O_ACCMODE) == O_RDWR) { atomic_inc(&snapshot_device_available); - return -ENOSYS; + error = -ENOSYS; + goto Unlock; } if(create_basic_memory_bitmaps()) { atomic_inc(&snapshot_device_available); - return -ENOMEM; + error = -ENOMEM; + goto Unlock; } nonseekable_open(inode, filp); data = &snapshot_state; @@ -99,33 +105,36 @@ static int snapshot_open(struct inode *inode, struct file *filp) if (error) pm_notifier_call_chain(PM_POST_HIBERNATION); } - if (error) { + if (error) atomic_inc(&snapshot_device_available); - return error; - } data->frozen = 0; data->ready = 0; data->platform_support = 0; - return 0; + Unlock: + mutex_unlock(&pm_mutex); + + return error; } static int snapshot_release(struct inode *inode, struct file *filp) { struct snapshot_data *data; + mutex_lock(&pm_mutex); + swsusp_free(); free_basic_memory_bitmaps(); data = filp->private_data; free_all_swap_pages(data->swap); - if (data->frozen) { - mutex_lock(&pm_mutex); + if (data->frozen) thaw_processes(); - mutex_unlock(&pm_mutex); - } pm_notifier_call_chain(data->mode == O_WRONLY ? PM_POST_HIBERNATION : PM_POST_RESTORE); atomic_inc(&snapshot_device_available); + + mutex_unlock(&pm_mutex); + return 0; } @@ -135,9 +144,13 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, struct snapshot_data *data; ssize_t res; + mutex_lock(&pm_mutex); + data = filp->private_data; - if (!data->ready) - return -ENODATA; + if (!data->ready) { + res = -ENODATA; + goto Unlock; + } res = snapshot_read_next(&data->handle, count); if (res > 0) { if (copy_to_user(buf, data_of(data->handle), res)) @@ -145,6 +158,10 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, else *offp = data->handle.offset; } + + Unlock: + mutex_unlock(&pm_mutex); + return res; } @@ -154,6 +171,8 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, struct snapshot_data *data; ssize_t res; + mutex_lock(&pm_mutex); + data = filp->private_data; res = snapshot_write_next(&data->handle, count); if (res > 0) { @@ -162,6 +181,9 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, else *offp = data->handle.offset; } + + mutex_unlock(&pm_mutex); + return res; } @@ -180,16 +202,16 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - data = filp->private_data; + if (!mutex_trylock(&pm_mutex)) + return -EBUSY; - lock_kernel(); + data = filp->private_data; switch (cmd) { case SNAPSHOT_FREEZE: if (data->frozen) break; - mutex_lock(&pm_mutex); printk("Syncing filesystems ... 
"); sys_sync(); printk("done.\n"); @@ -197,7 +219,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = freeze_processes(); if (error) thaw_processes(); - mutex_unlock(&pm_mutex); if (!error) data->frozen = 1; break; @@ -205,9 +226,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, case SNAPSHOT_UNFREEZE: if (!data->frozen || data->ready) break; - mutex_lock(&pm_mutex); thaw_processes(); - mutex_unlock(&pm_mutex); data->frozen = 0; break; @@ -310,16 +329,11 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = -EPERM; break; } - if (!mutex_trylock(&pm_mutex)) { - error = -EBUSY; - break; - } /* * Tasks are frozen and the notifiers have been called with * PM_HIBERNATION_PREPARE */ error = suspend_devices_and_enter(PM_SUSPEND_MEM); - mutex_unlock(&pm_mutex); break; case SNAPSHOT_PLATFORM_SUPPORT: @@ -392,7 +406,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = -ENOTTY; } - unlock_kernel(); + + mutex_unlock(&pm_mutex); + return error; } -- cgit v1.2.3 From 98abed02007b19bbfd68b6d06a5485afc3eeb01b Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 19 Mar 2008 19:24:59 -0700 Subject: do_wait reorganization This breaks out the guts of do_wait into three subfunctions. The control flow is less nonobvious without so much goto. do_wait_thread and ptrace_do_wait contain the main work of the outer loop. wait_consider_task contains the main work of the inner loop. Signed-off-by: Roland McGrath --- kernel/exit.c | 215 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 135 insertions(+), 80 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index ceb25878283..7453356a961 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1238,7 +1238,7 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_zombie(struct task_struct *p, int noreap, +static int wait_task_zombie(struct task_struct *p, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { @@ -1246,7 +1246,10 @@ static int wait_task_zombie(struct task_struct *p, int noreap, int retval, status, traced; pid_t pid = task_pid_vnr(p); - if (unlikely(noreap)) { + if (!likely(options & WEXITED)) + return 0; + + if (unlikely(options & WNOWAIT)) { uid_t uid = p->uid; int exit_code = p->exit_code; int why, status; @@ -1397,13 +1400,16 @@ static int wait_task_zombie(struct task_struct *p, int noreap, * released the lock and the system call should return. */ static int wait_task_stopped(struct task_struct *p, - int noreap, struct siginfo __user *infop, + int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { int retval, exit_code, why; uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; + if (!(p->ptrace & PT_PTRACED) && !(options & WUNTRACED)) + return 0; + exit_code = 0; spin_lock_irq(&p->sighand->siglock); @@ -1421,7 +1427,7 @@ static int wait_task_stopped(struct task_struct *p, if (!exit_code) goto unlock_sig; - if (!noreap) + if (!unlikely(options & WNOWAIT)) p->exit_code = 0; uid = p->uid; @@ -1442,7 +1448,7 @@ unlock_sig: why = (p->ptrace & PT_PTRACED) ? 
CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); - if (unlikely(noreap)) + if (unlikely(options & WNOWAIT)) return wait_noreap_copyout(p, pid, uid, why, exit_code, infop, ru); @@ -1476,7 +1482,7 @@ unlock_sig: * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_continued(struct task_struct *p, int noreap, +static int wait_task_continued(struct task_struct *p, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { @@ -1484,6 +1490,9 @@ static int wait_task_continued(struct task_struct *p, int noreap, pid_t pid; uid_t uid; + if (!unlikely(options & WCONTINUED)) + return 0; + if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; @@ -1493,7 +1502,7 @@ static int wait_task_continued(struct task_struct *p, int noreap, spin_unlock_irq(&p->sighand->siglock); return 0; } - if (!noreap) + if (!unlikely(options & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; spin_unlock_irq(&p->sighand->siglock); @@ -1519,89 +1528,137 @@ static int wait_task_continued(struct task_struct *p, int noreap, return retval; } +/* + * Consider @p for a wait by @parent. + * + * -ECHILD should be in *@notask_error before the first call. + * Returns nonzero for a final return, when we have unlocked tasklist_lock. + * Returns zero if the search for a child should continue; + * then *@notask_error is 0 if @p is an eligible child, or still -ECHILD. + */ +static int wait_consider_task(struct task_struct *parent, + struct task_struct *p, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) +{ + int ret = eligible_child(type, pid, options, p); + if (ret <= 0) + return ret; + + if (p->exit_state == EXIT_DEAD) + return 0; + + /* + * We don't reap group leaders with subthreads. + */ + if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) + return wait_task_zombie(p, options, infop, stat_addr, ru); + + /* + * It's stopped or running now, so it might + * later continue, exit, or stop again. + */ + *notask_error = 0; + + if (task_is_stopped_or_traced(p)) + return wait_task_stopped(p, options, infop, stat_addr, ru); + + return wait_task_continued(p, options, infop, stat_addr, ru); +} + +/* + * Do the work of do_wait() for one thread in the group, @tsk. + * + * -ECHILD should be in *@notask_error before the first call. + * Returns nonzero for a final return, when we have unlocked tasklist_lock. + * Returns zero if the search for a child should continue; then + * *@notask_error is 0 if there were any eligible children, or still -ECHILD. + */ +static int do_wait_thread(struct task_struct *tsk, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, int __user *stat_addr, + struct rusage __user *ru) +{ + struct task_struct *p; + + list_for_each_entry(p, &tsk->children, sibling) { + int ret = wait_consider_task(tsk, p, notask_error, + type, pid, options, + infop, stat_addr, ru); + if (ret) + return ret; + } + + return 0; +} + +static int ptrace_do_wait(struct task_struct *tsk, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, int __user *stat_addr, + struct rusage __user *ru) +{ + struct task_struct *p; + + /* + * If we never saw an eligile child, check for children stolen by + * ptrace. 
We don't leave -ECHILD in *@notask_error if there are any, + * because we will eventually be allowed to wait for them again. + */ + if (!*notask_error) + return 0; + + list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) { + int ret = eligible_child(type, pid, options, p); + if (unlikely(ret < 0)) + return ret; + if (ret) { + *notask_error = 0; + return 0; + } + } + + return 0; +} + static long do_wait(enum pid_type type, struct pid *pid, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; - int flag, retval; + int retval; add_wait_queue(¤t->signal->wait_chldexit,&wait); repeat: - /* If there is nothing that can match our critier just get out */ + /* + * If there is nothing that can match our critiera just get out. + * We will clear @retval to zero if we see any child that might later + * match our criteria, even if we are not able to reap it yet. + */ retval = -ECHILD; if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type]))) goto end; - /* - * We will set this flag if we see any child that might later - * match our criteria, even if we are not able to reap it yet. - */ - flag = retval = 0; current->state = TASK_INTERRUPTIBLE; read_lock(&tasklist_lock); tsk = current; do { - struct task_struct *p; - - list_for_each_entry(p, &tsk->children, sibling) { - int ret = eligible_child(type, pid, options, p); - if (!ret) - continue; - - if (unlikely(ret < 0)) { - retval = ret; - } else if (task_is_stopped_or_traced(p)) { - /* - * It's stopped now, so it might later - * continue, exit, or stop again. - */ - flag = 1; - if (!(p->ptrace & PT_PTRACED) && - !(options & WUNTRACED)) - continue; - - retval = wait_task_stopped(p, - (options & WNOWAIT), infop, - stat_addr, ru); - } else if (p->exit_state == EXIT_ZOMBIE && - !delay_group_leader(p)) { - /* - * We don't reap group leaders with subthreads. - */ - if (!likely(options & WEXITED)) - continue; - retval = wait_task_zombie(p, - (options & WNOWAIT), infop, - stat_addr, ru); - } else if (p->exit_state != EXIT_DEAD) { - /* - * It's running now, so it might later - * exit, stop, or stop and then continue. - */ - flag = 1; - if (!unlikely(options & WCONTINUED)) - continue; - retval = wait_task_continued(p, - (options & WNOWAIT), infop, - stat_addr, ru); - } - if (retval != 0) /* tasklist_lock released */ - goto end; - } - if (!flag) { - list_for_each_entry(p, &tsk->ptrace_children, - ptrace_list) { - flag = eligible_child(type, pid, options, p); - if (!flag) - continue; - if (likely(flag > 0)) - break; - retval = flag; - goto end; - } + int tsk_result = do_wait_thread(tsk, &retval, + type, pid, options, + infop, stat_addr, ru); + if (!tsk_result) + tsk_result = ptrace_do_wait(tsk, &retval, + type, pid, options, + infop, stat_addr, ru); + if (tsk_result) { + /* + * tasklist_lock is unlocked and we have a final result. 
+ */ + retval = tsk_result; + goto end; } + if (options & __WNOTHREAD) break; tsk = next_thread(tsk); @@ -1609,16 +1666,14 @@ repeat: } while (tsk != current); read_unlock(&tasklist_lock); - if (flag) { - if (options & WNOHANG) - goto end; + if (!retval && !(options & WNOHANG)) { retval = -ERESTARTSYS; - if (signal_pending(current)) - goto end; - schedule(); - goto repeat; + if (!signal_pending(current)) { + schedule(); + goto repeat; + } } - retval = -ECHILD; + end: current->state = TASK_RUNNING; remove_wait_queue(¤t->signal->wait_chldexit,&wait); -- cgit v1.2.3 From f470021adb9190819c03d6d8c5c860a17480aa6d Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 24 Mar 2008 18:36:23 -0700 Subject: ptrace children revamp ptrace no longer fiddles with the children/sibling links, and the old ptrace_children list is gone. Now ptrace, whether of one's own children or another's via PTRACE_ATTACH, just uses the new ptraced list instead. There should be no user-visible difference that matters. The only change is the order in which do_wait() sees multiple stopped children and stopped ptrace attachees. Since wait_task_stopped() was changed earlier so it no longer reorders the children list, we already know this won't cause any new problems. Signed-off-by: Roland McGrath --- kernel/exit.c | 226 +++++++++++++++++++++++++++++--------------------------- kernel/fork.c | 6 +- kernel/ptrace.c | 37 ++++++---- 3 files changed, 146 insertions(+), 123 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 7453356a961..1e909826a80 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -71,7 +71,7 @@ static void __unhash_process(struct task_struct *p) __get_cpu_var(process_counts)--; } list_del_rcu(&p->thread_group); - remove_parent(p); + list_del_init(&p->sibling); } /* @@ -152,6 +152,18 @@ static void delayed_put_task_struct(struct rcu_head *rhp) put_task_struct(container_of(rhp, struct task_struct, rcu)); } +/* + * Do final ptrace-related cleanup of a zombie being reaped. + * + * Called with write_lock(&tasklist_lock) held. + */ +static void ptrace_release_task(struct task_struct *p) +{ + BUG_ON(!list_empty(&p->ptraced)); + ptrace_unlink(p); + BUG_ON(!list_empty(&p->ptrace_entry)); +} + void release_task(struct task_struct * p) { struct task_struct *leader; @@ -160,8 +172,7 @@ repeat: atomic_dec(&p->user->processes); proc_flush_task(p); write_lock_irq(&tasklist_lock); - ptrace_unlink(p); - BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); + ptrace_release_task(p); __exit_signal(p); /* @@ -315,9 +326,8 @@ static void reparent_to_kthreadd(void) ptrace_unlink(current); /* Reparent to init */ - remove_parent(current); current->real_parent = current->parent = kthreadd_task; - add_parent(current); + list_move_tail(¤t->sibling, ¤t->real_parent->children); /* Set the exit signal to SIGCHLD so we signal init on exit */ current->exit_signal = SIGCHLD; @@ -692,37 +702,71 @@ static void exit_mm(struct task_struct * tsk) mmput(mm); } -static void -reparent_thread(struct task_struct *p, struct task_struct *father, int traced) +/* + * Detach all tasks we were using ptrace on. + * Any that need to be release_task'd are put on the @dead list. + * + * Called with write_lock(&tasklist_lock) held. + */ +static void ptrace_exit(struct task_struct *parent, struct list_head *dead) { - if (p->pdeath_signal) - /* We already hold the tasklist_lock here. 
*/ - group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); + struct task_struct *p, *n; - /* Move the child from its dying parent to the new one. */ - if (unlikely(traced)) { - /* Preserve ptrace links if someone else is tracing this child. */ - list_del_init(&p->ptrace_list); - if (ptrace_reparented(p)) - list_add(&p->ptrace_list, &p->real_parent->ptrace_children); - } else { - /* If this child is being traced, then we're the one tracing it - * anyway, so let go of it. + list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) { + __ptrace_unlink(p); + + if (p->exit_state != EXIT_ZOMBIE) + continue; + + /* + * If it's a zombie, our attachedness prevented normal + * parent notification or self-reaping. Do notification + * now if it would have happened earlier. If it should + * reap itself, add it to the @dead list. We can't call + * release_task() here because we already hold tasklist_lock. + * + * If it's our own child, there is no notification to do. */ - p->ptrace = 0; - remove_parent(p); - p->parent = p->real_parent; - add_parent(p); + if (!task_detached(p) && thread_group_empty(p)) { + if (!same_thread_group(p->real_parent, parent)) + do_notify_parent(p, p->exit_signal); + } - if (task_is_traced(p)) { + if (task_detached(p)) { /* - * If it was at a trace stop, turn it into - * a normal stop since it's no longer being - * traced. + * Mark it as in the process of being reaped. */ - ptrace_untrace(p); + p->exit_state = EXIT_DEAD; + list_add(&p->ptrace_entry, dead); } } +} + +/* + * Finish up exit-time ptrace cleanup. + * + * Called without locks. + */ +static void ptrace_exit_finish(struct task_struct *parent, + struct list_head *dead) +{ + struct task_struct *p, *n; + + BUG_ON(!list_empty(&parent->ptraced)); + + list_for_each_entry_safe(p, n, dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } +} + +static void reparent_thread(struct task_struct *p, struct task_struct *father) +{ + if (p->pdeath_signal) + /* We already hold the tasklist_lock here. */ + group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); + + list_move_tail(&p->sibling, &p->real_parent->children); /* If this is a threaded reparent there is no need to * notify anyone anything has happened. @@ -737,7 +781,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) /* If we'd notified the old parent about this child's death, * also notify the new parent. */ - if (!traced && p->exit_state == EXIT_ZOMBIE && + if (!ptrace_reparented(p) && + p->exit_state == EXIT_ZOMBIE && !task_detached(p) && thread_group_empty(p)) do_notify_parent(p, p->exit_signal); @@ -754,12 +799,15 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) static void forget_original_parent(struct task_struct *father) { struct task_struct *p, *n, *reaper = father; - struct list_head ptrace_dead; - - INIT_LIST_HEAD(&ptrace_dead); + LIST_HEAD(ptrace_dead); write_lock_irq(&tasklist_lock); + /* + * First clean up ptrace if we were using it. + */ + ptrace_exit(father, &ptrace_dead); + do { reaper = next_thread(reaper); if (reaper == father) { @@ -768,58 +816,19 @@ static void forget_original_parent(struct task_struct *father) } } while (reaper->flags & PF_EXITING); - /* - * There are only two places where our children can be: - * - * - in our child list - * - in our ptraced child list - * - * Search them and reparent children. 
- */ list_for_each_entry_safe(p, n, &father->children, sibling) { - int ptrace; - - ptrace = p->ptrace; - - /* if father isn't the real parent, then ptrace must be enabled */ - BUG_ON(father != p->real_parent && !ptrace); - - if (father == p->real_parent) { - /* reparent with a reaper, real father it's us */ - p->real_parent = reaper; - reparent_thread(p, father, 0); - } else { - /* reparent ptraced task to its real parent */ - __ptrace_unlink (p); - if (p->exit_state == EXIT_ZOMBIE && !task_detached(p) && - thread_group_empty(p)) - do_notify_parent(p, p->exit_signal); - } - - /* - * if the ptraced child is a detached zombie we must collect - * it before we exit, or it will remain zombie forever since - * we prevented it from self-reap itself while it was being - * traced by us, to be able to see it in wait4. - */ - if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && task_detached(p))) - list_add(&p->ptrace_list, &ptrace_dead); - } - - list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) { p->real_parent = reaper; - reparent_thread(p, father, 1); + if (p->parent == father) { + BUG_ON(p->ptrace); + p->parent = p->real_parent; + } + reparent_thread(p, father); } write_unlock_irq(&tasklist_lock); BUG_ON(!list_empty(&father->children)); - BUG_ON(!list_empty(&father->ptrace_children)); - - list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) { - list_del_init(&p->ptrace_list); - release_task(p); - } + ptrace_exit_finish(father, &ptrace_dead); } /* @@ -1180,13 +1189,6 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options, return 0; } - /* - * Do not consider detached threads that are - * not ptraced: - */ - if (task_detached(p) && !p->ptrace) - return 0; - /* Wait for all children (clone and not) if __WALL is set; * otherwise, wait for clone children *only* if __WCLONE is * set; otherwise, wait for non-clone children *only*. (Note: @@ -1399,7 +1401,7 @@ static int wait_task_zombie(struct task_struct *p, int options, * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_stopped(struct task_struct *p, +static int wait_task_stopped(int ptrace, struct task_struct *p, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { @@ -1407,7 +1409,7 @@ static int wait_task_stopped(struct task_struct *p, uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; - if (!(p->ptrace & PT_PTRACED) && !(options & WUNTRACED)) + if (!(options & WUNTRACED)) return 0; exit_code = 0; @@ -1416,7 +1418,7 @@ static int wait_task_stopped(struct task_struct *p, if (unlikely(!task_is_stopped_or_traced(p))) goto unlock_sig; - if (!(p->ptrace & PT_PTRACED) && p->signal->group_stop_count > 0) + if (!ptrace && p->signal->group_stop_count > 0) /* * A group stop is in progress and this is the group leader. * We won't report until all threads have stopped. @@ -1445,7 +1447,7 @@ unlock_sig: */ get_task_struct(p); pid = task_pid_vnr(p); - why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; + why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); if (unlikely(options & WNOWAIT)) @@ -1536,7 +1538,7 @@ static int wait_task_continued(struct task_struct *p, int options, * Returns zero if the search for a child should continue; * then *@notask_error is 0 if @p is an eligible child, or still -ECHILD. 
*/ -static int wait_consider_task(struct task_struct *parent, +static int wait_consider_task(struct task_struct *parent, int ptrace, struct task_struct *p, int *notask_error, enum pid_type type, struct pid *pid, int options, struct siginfo __user *infop, @@ -1546,6 +1548,15 @@ static int wait_consider_task(struct task_struct *parent, if (ret <= 0) return ret; + if (likely(!ptrace) && unlikely(p->ptrace)) { + /* + * This child is hidden by ptrace. + * We aren't allowed to see it now, but eventually we will. + */ + *notask_error = 0; + return 0; + } + if (p->exit_state == EXIT_DEAD) return 0; @@ -1562,7 +1573,8 @@ static int wait_consider_task(struct task_struct *parent, *notask_error = 0; if (task_is_stopped_or_traced(p)) - return wait_task_stopped(p, options, infop, stat_addr, ru); + return wait_task_stopped(ptrace, p, options, + infop, stat_addr, ru); return wait_task_continued(p, options, infop, stat_addr, ru); } @@ -1583,11 +1595,16 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error, struct task_struct *p; list_for_each_entry(p, &tsk->children, sibling) { - int ret = wait_consider_task(tsk, p, notask_error, - type, pid, options, - infop, stat_addr, ru); - if (ret) - return ret; + /* + * Do not consider detached threads. + */ + if (!task_detached(p)) { + int ret = wait_consider_task(tsk, 0, p, notask_error, + type, pid, options, + infop, stat_addr, ru); + if (ret) + return ret; + } } return 0; @@ -1601,21 +1618,16 @@ static int ptrace_do_wait(struct task_struct *tsk, int *notask_error, struct task_struct *p; /* - * If we never saw an eligile child, check for children stolen by - * ptrace. We don't leave -ECHILD in *@notask_error if there are any, - * because we will eventually be allowed to wait for them again. + * Traditionally we see ptrace'd stopped tasks regardless of options. */ - if (!*notask_error) - return 0; + options |= WUNTRACED; - list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) { - int ret = eligible_child(type, pid, options, p); - if (unlikely(ret < 0)) + list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { + int ret = wait_consider_task(tsk, 1, p, notask_error, + type, pid, options, + infop, stat_addr, ru); + if (ret) return ret; - if (ret) { - *notask_error = 0; - return 0; - } } return 0; diff --git a/kernel/fork.c b/kernel/fork.c index 4bd2f516401..adefc1131f2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1125,8 +1125,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); - INIT_LIST_HEAD(&p->ptrace_children); - INIT_LIST_HEAD(&p->ptrace_list); + INIT_LIST_HEAD(&p->ptrace_entry); + INIT_LIST_HEAD(&p->ptraced); /* Now that the task is set up, run cgroup callbacks if * necessary. 
We need to run them before the task is visible @@ -1198,7 +1198,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, } if (likely(p->pid)) { - add_parent(p); + list_add_tail(&p->sibling, &p->real_parent->children); if (unlikely(p->ptrace & PT_PTRACED)) __ptrace_link(p, current->parent); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index e337390fce0..8392a9da645 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -33,13 +33,9 @@ */ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) { - BUG_ON(!list_empty(&child->ptrace_list)); - if (child->parent == new_parent) - return; - list_add(&child->ptrace_list, &child->parent->ptrace_children); - remove_parent(child); + BUG_ON(!list_empty(&child->ptrace_entry)); + list_add(&child->ptrace_entry, &new_parent->ptraced); child->parent = new_parent; - add_parent(child); } /* @@ -73,12 +69,8 @@ void __ptrace_unlink(struct task_struct *child) BUG_ON(!child->ptrace); child->ptrace = 0; - if (ptrace_reparented(child)) { - list_del_init(&child->ptrace_list); - remove_parent(child); - child->parent = child->real_parent; - add_parent(child); - } + child->parent = child->real_parent; + list_del_init(&child->ptrace_entry); if (task_is_traced(child)) ptrace_untrace(child); @@ -492,15 +484,34 @@ int ptrace_traceme(void) /* * Are we already being traced? */ +repeat: task_lock(current); if (!(current->ptrace & PT_PTRACED)) { + /* + * See ptrace_attach() comments about the locking here. + */ + unsigned long flags; + if (!write_trylock_irqsave(&tasklist_lock, flags)) { + task_unlock(current); + do { + cpu_relax(); + } while (!write_can_lock(&tasklist_lock)); + goto repeat; + } + ret = security_ptrace(current->parent, current, PTRACE_MODE_ATTACH); + /* * Set the ptrace bit in the process ptrace flags. + * Then link us on our parent's ptraced list. */ - if (!ret) + if (!ret) { current->ptrace |= PT_PTRACED; + __ptrace_link(current, current->real_parent); + } + + write_unlock_irqrestore(&tasklist_lock, flags); } task_unlock(current); return ret; -- cgit v1.2.3 From 14dd0b81414a58caf0296dbeace016bb0a5d11ab Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Sun, 30 Mar 2008 18:41:25 -0700 Subject: do_wait: return security_task_wait() error code in place of -ECHILD This reverts the effect of commit f2cc3eb133baa2e9dc8efd40f417106b2ee520f3 "do_wait: fix security checks". That change reverted the effect of commit 73243284463a761e04d69d22c7516b2be7de096c. The rationale for the original commit still stands. The inconsistent treatment of children hidden by ptrace was an unintended omission in the original change and in no way invalidates its purpose. This makes do_wait return the error returned by security_task_wait() (usually -EACCES) in place of -ECHILD when there are some children the caller would be able to wait for if not for the permission failure. A permission error will give the user a clue to look for security policy problems, rather than for mysterious wait bugs. 
Signed-off-by: Roland McGrath --- kernel/exit.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 1e909826a80..a2af6cac823 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1199,14 +1199,10 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options, return 0; err = security_task_wait(p); - if (likely(!err)) - return 1; + if (err) + return err; - if (type != PIDTYPE_PID) - return 0; - /* This child was explicitly requested, abort */ - read_unlock(&tasklist_lock); - return err; + return 1; } static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, @@ -1536,7 +1532,8 @@ static int wait_task_continued(struct task_struct *p, int options, * -ECHILD should be in *@notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; - * then *@notask_error is 0 if @p is an eligible child, or still -ECHILD. + * then *@notask_error is 0 if @p is an eligible child, + * or another error from security_task_wait(), or still -ECHILD. */ static int wait_consider_task(struct task_struct *parent, int ptrace, struct task_struct *p, int *notask_error, @@ -1545,9 +1542,21 @@ static int wait_consider_task(struct task_struct *parent, int ptrace, int __user *stat_addr, struct rusage __user *ru) { int ret = eligible_child(type, pid, options, p); - if (ret <= 0) + if (!ret) return ret; + if (unlikely(ret < 0)) { + /* + * If we have not yet seen any eligible child, + * then let this error code replace -ECHILD. + * A permission error will give the user a clue + * to look for security policy problems, rather + * than for mysterious wait bugs. + */ + if (*notask_error) + *notask_error = ret; + } + if (likely(!ptrace) && unlikely(p->ptrace)) { /* * This child is hidden by ptrace. @@ -1585,7 +1594,8 @@ static int wait_consider_task(struct task_struct *parent, int ptrace, * -ECHILD should be in *@notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; then - * *@notask_error is 0 if there were any eligible children, or still -ECHILD. + * *@notask_error is 0 if there were any eligible children, + * or another error from security_task_wait(), or still -ECHILD. */ static int do_wait_thread(struct task_struct *tsk, int *notask_error, enum pid_type type, struct pid *pid, int options, -- cgit v1.2.3 From 666f164f4fbfa78bd00fb4b74788b42a39842c64 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 8 Apr 2008 23:12:30 -0700 Subject: fix dangling zombie when new parent ignores children This fixes an arcane bug that we think was a regression introduced by commit b2b2cbc4b2a2f389442549399a993a8306420baf. When a parent ignores SIGCHLD (or uses SA_NOCLDWAIT), its children would self-reap but they don't if it's using ptrace on them. When the parent thread later exits and ceases to ptrace a child but leaves other live threads in the parent's thread group, any zombie children are left dangling. The fix makes them self-reap then, as they would have done earlier if ptrace had not been in use. 
Signed-off-by: Roland McGrath --- kernel/exit.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a2af6cac823..93d2711b938 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -702,6 +702,23 @@ static void exit_mm(struct task_struct * tsk) mmput(mm); } +/* + * Return nonzero if @parent's children should reap themselves. + * + * Called with write_lock_irq(&tasklist_lock) held. + */ +static int ignoring_children(struct task_struct *parent) +{ + int ret; + struct sighand_struct *psig = parent->sighand; + unsigned long flags; + spin_lock_irqsave(&psig->siglock, flags); + ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || + (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT)); + spin_unlock_irqrestore(&psig->siglock, flags); + return ret; +} + /* * Detach all tasks we were using ptrace on. * Any that need to be release_task'd are put on the @dead list. @@ -711,6 +728,7 @@ static void exit_mm(struct task_struct * tsk) static void ptrace_exit(struct task_struct *parent, struct list_head *dead) { struct task_struct *p, *n; + int ign = -1; list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) { __ptrace_unlink(p); @@ -726,10 +744,18 @@ static void ptrace_exit(struct task_struct *parent, struct list_head *dead) * release_task() here because we already hold tasklist_lock. * * If it's our own child, there is no notification to do. + * But if our normal children self-reap, then this child + * was prevented by ptrace and we must reap it now. */ if (!task_detached(p) && thread_group_empty(p)) { if (!same_thread_group(p->real_parent, parent)) do_notify_parent(p, p->exit_signal); + else { + if (ign < 0) + ign = ignoring_children(parent); + if (ign) + p->exit_signal = -1; + } } if (task_detached(p)) { -- cgit v1.2.3 From c349e0a01c3e0f70913db6a5bb61ab204e0602de Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 15 Apr 2008 22:39:31 +0200 Subject: ftrace: do not trace scheduler functions do not trace scheduler functions - it's still a bit fragile and can lock up with: http://redhat.com/~mingo/misc/config-Thu_Jul_17_13_34_52_CEST_2008 Signed-off-by: Ingo Molnar --- kernel/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 0a7ed838984..985ddb7da4d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,8 +11,6 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o -CFLAGS_REMOVE_sched.o = -mno-spe - ifdef CONFIG_FTRACE # Do not trace debug files and internal ftrace files CFLAGS_REMOVE_lockdep.o = -pg @@ -21,6 +19,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg +CFLAGS_REMOVE_sched.o = -mno-spe -pg endif obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o -- cgit v1.2.3
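A minimal, hypothetical sketch of how a kernel thread might use the set_freezable_with_signal() interface introduced by the PF_FREEZER_NOSIG patch earlier in this series; the thread function, its loop body, and the no-argument signature (assumed by analogy with set_freezable()) are illustrative only and do not appear in any patch above:

#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/sched.h>

/*
 * Hypothetical freezable kernel thread that wants to be treated like a
 * user-space task by the freezer: it opts back into fake-signal freezing
 * (per the commit message, clearing PF_FREEZER_NOSIG), then parks in
 * try_to_freeze() on each pass through its main loop.
 */
static int example_freezable_thread(void *unused)
{
	set_freezable_with_signal();	/* opt in to fake-signal freezing */

	while (!kthread_should_stop()) {
		try_to_freeze();	/* freeze here during suspend/hibernation */

		/* ... the thread's real work would go here ... */

		schedule_timeout_interruptible(HZ);
	}
	return 0;
}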