From 71a851b4d2a815adcfac09c1adda7ef6811fde66 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Jul 2009 09:06:56 +0200 Subject: perf_counter: Stop open coding unclone_ctx Instead of open coding the unclone context thingy, put it in a common function. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d55a50da234..8bf997d86bf 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -146,6 +146,14 @@ static void put_ctx(struct perf_counter_context *ctx) } } +static void unclone_ctx(struct perf_counter_context *ctx) +{ + if (ctx->parent_ctx) { + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; + } +} + /* * Get the perf_counter_context for a task and lock it. * This has to cope with with the fact that until it is locked, @@ -1463,10 +1471,8 @@ static void perf_counter_enable_on_exec(struct task_struct *task) /* * Unclone this context if we enabled any counter. */ - if (enabled && ctx->parent_ctx) { - put_ctx(ctx->parent_ctx); - ctx->parent_ctx = NULL; - } + if (enabled) + unclone_ctx(ctx); spin_unlock(&ctx->lock); @@ -1526,7 +1532,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx, static struct perf_counter_context *find_get_context(pid_t pid, int cpu) { - struct perf_counter_context *parent_ctx; struct perf_counter_context *ctx; struct perf_cpu_context *cpuctx; struct task_struct *task; @@ -1586,11 +1591,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) retry: ctx = perf_lock_task_context(task, &flags); if (ctx) { - parent_ctx = ctx->parent_ctx; - if (parent_ctx) { - put_ctx(parent_ctx); - ctx->parent_ctx = NULL; /* no longer a clone */ - } + unclone_ctx(ctx); spin_unlock_irqrestore(&ctx->lock, flags); } @@ -4255,15 +4256,12 @@ void perf_counter_exit_task(struct task_struct *child) */ spin_lock(&child_ctx->lock); child->perf_counter_ctxp = NULL; - if (child_ctx->parent_ctx) { - /* - * This context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the counters from it. - */ - put_ctx(child_ctx->parent_ctx); - child_ctx->parent_ctx = NULL; - } + /* + * If this context is a clone; unclone it so it can't get + * swapped to another process while we're removing all + * the counters from it. + */ + unclone_ctx(child_ctx); spin_unlock(&child_ctx->lock); local_irq_restore(flags); -- cgit v1.2.3 From d4d7d0b9545721d3cabb19d15163bbc66b797707 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 6 Jul 2009 09:31:33 +0100 Subject: perf_counter: Fix the tracepoint channel to perfcounters Fix a missed rename in EVENT_PROFILE support so that it gets built and allows tracepoint tracing from the 'perf' tool. Fix a typo in the (never before built & enabled) portion in perf_counter.c as well, and update that code to the attr.config changes as well. Signed-off-by: Chris Wilson Cc: Ben Gamari Cc: Jason Baron Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt LKML-Reference: <1246869094-21237-1-git-send-email-chris@chris-wilson.co.uk> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d55a50da234..c6c38fb7766 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3671,7 +3671,7 @@ static const struct pmu perf_ops_task_clock = { void perf_tpcounter_event(int event_id) { struct perf_sample_data data = { - .regs = get_irq_regs(); + .regs = get_irq_regs(), .addr = 0, }; @@ -3687,16 +3687,12 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - ftrace_profile_disable(perf_event_id(&counter->attr)); + ftrace_profile_disable(counter->attr.config); } static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { - int event_id = perf_event_id(&counter->attr); - int ret; - - ret = ftrace_profile_enable(event_id); - if (ret) + if (ftrace_profile_enable(counter->attr.config)) return NULL; counter->destroy = tp_perf_counter_destroy; -- cgit v1.2.3 From 54fdc5816631b43ba55fc3206d7add2d85850bc6 Mon Sep 17 00:00:00 2001 From: Fabio Checconi Date: Thu, 16 Jul 2009 12:32:27 +0200 Subject: sched: Account for vruntime wrapping I spotted two sites that didn't take vruntime wrap-around into account. Fix these by creating a comparison helper that does do so. Signed-off-by: Fabio Checconi Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7c248dc30f4..9ffb2b2ceba 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -266,6 +266,12 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } +static inline int entity_before(struct sched_entity *a, + struct sched_entity *b) +{ + return (s64)(a->vruntime - b->vruntime) < 0; +} + static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { return se->vruntime - cfs_rq->min_vruntime; @@ -1017,7 +1023,7 @@ static void yield_task_fair(struct rq *rq) /* * Already in the rightmost position? */ - if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) + if (unlikely(!rightmost || entity_before(rightmost, se))) return; /* @@ -1713,7 +1719,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) /* 'curr' will be NULL if the child belongs to a different group */ if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && - curr && curr->vruntime < se->vruntime) { + curr && entity_before(curr, se)) { /* * Upon rescheduling, sched_class::put_prev_task() will place * 'current' within the tree based on its new key value. -- cgit v1.2.3 From 413ee3b48ab582ffea33e7e140c7a2c5ea657e9a Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 16 Jul 2009 15:15:52 +0200 Subject: perf_counter: Make sure we dont leak kernel memory to userspace There are a few places we are leaking tiny amounts of kernel memory to userspace. This happens when writing out strings because we always align the end to 64 bits. To avoid this we should always use an appropriately sized temporary buffer and ensure it is zeroed. Since d_path assembles the string from the end of the buffer backwards, we need to add 64 bits after the buffer to allow for alignment. We also need to copy arch_vma_name to the temporary buffer, because if we use it directly we may end up copying to userspace a number of bytes after the end of the string constant. Signed-off-by: Anton Blanchard Signed-off-by: Peter Zijlstra LKML-Reference: <20090716104817.273972048@samba.org> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c6c38fb7766..f7a8ab9576e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2968,8 +2968,10 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; unsigned int size; - char *comm = comm_event->task->comm; + char comm[TASK_COMM_LEN]; + memset(comm, 0, sizeof(comm)); + strncpy(comm, comm_event->task->comm, sizeof(comm)); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; @@ -3088,8 +3090,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) char *buf = NULL; const char *name; + memset(tmp, 0, sizeof(tmp)); + if (file) { - buf = kzalloc(PATH_MAX, GFP_KERNEL); + /* + * d_path works from the end of the buffer backwards, so we + * need to add enough zero bytes after the string to handle + * the 64bit alignment we do later. + */ + buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); if (!buf) { name = strncpy(tmp, "//enomem", sizeof(tmp)); goto got_name; @@ -3100,9 +3109,11 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) goto got_name; } } else { - name = arch_vma_name(mmap_event->vma); - if (name) + if (arch_vma_name(mmap_event->vma)) { + name = strncpy(tmp, arch_vma_name(mmap_event->vma), + sizeof(tmp)); goto got_name; + } if (!vma->vm_mm) { name = strncpy(tmp, "[vdso]", sizeof(tmp)); -- cgit v1.2.3 From ed900c054b541254f0ce5cedaf75206e29bd614e Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: perf_counter: Log vfork as a fork event Right now we don't output vfork events. Even though we should always see an exec after a vfork, we may get perfcounter samples between the vfork and exec. These samples can lead to some confusion when parsing perfcounter data. To keep things consistent we should always log a fork event. It will result in a little more log data, but is less confusing to trace parsing tools. Signed-off-by: Anton Blanchard Signed-off-by: Peter Zijlstra LKML-Reference: <20090716104817.589309391@samba.org> Signed-off-by: Ingo Molnar --- kernel/fork.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 467746b3f0a..4812d60b29f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1408,14 +1408,11 @@ long do_fork(unsigned long clone_flags, if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); - } else if (!(clone_flags & CLONE_VM)) { - /* - * vfork will do an exec which will call - * set_task_comm() - */ - perf_counter_fork(p); } + if (!(clone_flags & CLONE_THREAD)) + perf_counter_fork(p); + audit_finish_fork(p); tracehook_report_clone(regs, clone_flags, nr, p); -- cgit v1.2.3 From a468d389349a7560249b355cdb6d2097ea1616c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jul 2009 14:15:46 +0200 Subject: sched: fix load average accounting vs. cpu hotplug The new load average code clears rq->calc_load_active on CPU_ONLINE. That's wrong as the new onlined CPU might have got a scheduler tick already and accounted the delta to the stale value of the time we offlined the CPU. Clear the value when we cleanup the dead CPU instead. Also move the update of the calc_load_update time for the newly online CPU to CPU_UP_PREPARE to avoid that the CPU plays catch up with the stale update time value. Signed-off-by: Thomas Gleixner --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 98972d366fd..1b59e265273 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7289,6 +7289,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu) static void calc_global_load_remove(struct rq *rq) { atomic_long_sub(rq->calc_load_active, &calc_load_tasks); + rq->calc_load_active = 0; } #endif /* CONFIG_HOTPLUG_CPU */ @@ -7515,6 +7516,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) task_rq_unlock(rq, &flags); get_task_struct(p); cpu_rq(cpu)->migration_thread = p; + rq->calc_load_update = calc_load_update; break; case CPU_ONLINE: @@ -7525,8 +7527,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Update our root-domain */ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); - rq->calc_load_update = calc_load_update; - rq->calc_load_active = 0; if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -- cgit v1.2.3 From 6301cb95c119ebf324bb96ee226fa9ddffad80a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jul 2009 14:15:47 +0200 Subject: sched: fix nr_uninterruptible accounting of frozen tasks really commit e3c8ca8336 (sched: do not count frozen tasks toward load) broke the nr_uninterruptible accounting on freeze/thaw. On freeze the task is excluded from accounting with a check for (task->flags & PF_FROZEN), but that flag is cleared before the task is thawed. So while we prevent that the task with state TASK_UNINTERRUPTIBLE is accounted to nr_uninterruptible on freeze we decrement nr_uninterruptible on thaw. Use a separate flag which is handled by the freezing task itself. Set it before calling the scheduler with TASK_UNINTERRUPTIBLE state and clear it after we return from frozen state. Cc: Signed-off-by: Thomas Gleixner --- kernel/freezer.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index 2f4936cf708..bd1d42b17cb 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -44,12 +44,19 @@ void refrigerator(void) recalc_sigpending(); /* We sent fake signal, clean it up */ spin_unlock_irq(¤t->sighand->siglock); + /* prevent accounting of that task to load */ + current->flags |= PF_FREEZING; + for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!frozen(current)) break; schedule(); } + + /* Remove the accounting blocker */ + current->flags &= ~PF_FREEZING; + pr_debug("%s left refrigerator\n", current->comm); __set_current_state(save); } -- cgit v1.2.3 From 4841158b26e28e1476eed84c7347c18f11317750 Mon Sep 17 00:00:00 2001 From: Pavel Roskin Date: Sat, 18 Jul 2009 16:46:02 -0400 Subject: timer: Avoid reading uninitialized data timer->expires may be uninitialized, so check timer_pending() before touching timer->expires to pacify kmemcheck. Signed-off-by: Pavel Roskin LKML-Reference: <20090718204602.5191.360.stgit@mj.roinet.com> Signed-off-by: Thomas Gleixner --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 0b36b9e5cc8..a7f07d5a624 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -714,7 +714,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires) * networking code - if the timer is re-modified * to be the same thing then just return: */ - if (timer->expires == expires && timer_pending(timer)) + if (timer_pending(timer) && timer->expires == expires) return 1; return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); -- cgit v1.2.3 From 79ef2bb01445400def20c7993b27fbcad27ca95f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 Jul 2009 17:09:12 +0200 Subject: clocksource: Prevent NULL pointer dereference Writing a zero length string to sys/.../current_clocksource will cause a NULL pointer dereference if the clock events system is in one shot (highres or nohz) mode. Pointed-out-by: Dan Carpenter LKML-Reference: Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 592bf584d1d..7466cb81125 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -513,7 +513,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, * Check to make sure we don't switch to a non-highres capable * clocksource if the tick code is in oneshot mode (highres or nohz) */ - if (tick_oneshot_mode_active() && + if (tick_oneshot_mode_active() && ovr && !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) { printk(KERN_WARNING "%s clocksource is not HRT compatible. " "Cannot switch while in HRT/NOHZ mode\n", ovr->name); -- cgit v1.2.3 From 591d2fb02ea80472d846c0b8507007806bdd69cc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Jul 2009 11:09:39 +0200 Subject: genirq: Delegate irq affinity setting to the irq thread irq_set_thread_affinity() calls set_cpus_allowed_ptr() which might sleep, but irq_set_thread_affinity() is called with desc->lock held and can be called from hard interrupt context as well. The code has another bug as it does not hold a ref on the task struct as required by set_cpus_allowed_ptr(). Just set the IRQTF_AFFINITY bit in action->thread_flags. The next time the thread runs it migrates itself. Solves all of the above problems nicely. Add kerneldoc to irq_set_thread_affinity() while at it. Signed-off-by: Thomas Gleixner LKML-Reference: --- kernel/irq/internals.h | 3 +-- kernel/irq/manage.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------ kernel/irq/migration.c | 2 +- 3 files changed, 46 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 73468253143..e70ed5592eb 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -42,8 +42,7 @@ static inline void unregister_handler_proc(unsigned int irq, extern int irq_select_affinity_usr(unsigned int irq); -extern void -irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask); +extern void irq_set_thread_affinity(struct irq_desc *desc); /* * Debugging printout: diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 50da6767290..f0de36f13a4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -80,14 +80,22 @@ int irq_can_set_affinity(unsigned int irq) return 1; } -void -irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) +/** + * irq_set_thread_affinity - Notify irq threads to adjust affinity + * @desc: irq descriptor which has affitnity changed + * + * We just set IRQTF_AFFINITY and delegate the affinity setting + * to the interrupt thread itself. We can not call + * set_cpus_allowed_ptr() here as we hold desc->lock and this + * code can be called from hard interrupt context. + */ +void irq_set_thread_affinity(struct irq_desc *desc) { struct irqaction *action = desc->action; while (action) { if (action->thread) - set_cpus_allowed_ptr(action->thread, cpumask); + set_bit(IRQTF_AFFINITY, &action->thread_flags); action = action->next; } } @@ -112,7 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) if (desc->status & IRQ_MOVE_PCNTXT) { if (!desc->chip->set_affinity(irq, cpumask)) { cpumask_copy(desc->affinity, cpumask); - irq_set_thread_affinity(desc, cpumask); + irq_set_thread_affinity(desc); } } else { @@ -122,7 +130,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) #else if (!desc->chip->set_affinity(irq, cpumask)) { cpumask_copy(desc->affinity, cpumask); - irq_set_thread_affinity(desc, cpumask); + irq_set_thread_affinity(desc); } #endif desc->status |= IRQ_AFFINITY_SET; @@ -176,7 +184,7 @@ int irq_select_affinity_usr(unsigned int irq) spin_lock_irqsave(&desc->lock, flags); ret = setup_affinity(irq, desc); if (!ret) - irq_set_thread_affinity(desc, desc->affinity); + irq_set_thread_affinity(desc); spin_unlock_irqrestore(&desc->lock, flags); return ret; @@ -443,6 +451,34 @@ static int irq_wait_for_interrupt(struct irqaction *action) return -1; } +/* + * Check whether we need to change the affinity of the interrupt thread. + */ +static void +irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) +{ + cpumask_var_t mask; + + if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) + return; + + /* + * In case we are out of memory we set IRQTF_AFFINITY again and + * try again next time + */ + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { + set_bit(IRQTF_AFFINITY, &action->thread_flags); + return; + } + + spin_lock_irq(&desc->lock); + cpumask_copy(mask, desc->affinity); + spin_unlock_irq(&desc->lock); + + set_cpus_allowed_ptr(current, mask); + free_cpumask_var(mask); +} + /* * Interrupt handler thread */ @@ -458,6 +494,8 @@ static int irq_thread(void *data) while (!irq_wait_for_interrupt(action)) { + irq_thread_check_affinity(desc, action); + atomic_inc(&desc->threads_active); spin_lock_irq(&desc->lock); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index cfe767ca154..fcb6c96f262 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -45,7 +45,7 @@ void move_masked_irq(int irq) < nr_cpu_ids)) if (!desc->chip->set_affinity(irq, desc->pending_mask)) { cpumask_copy(desc->affinity, desc->pending_mask); - irq_set_thread_affinity(desc, desc->pending_mask); + irq_set_thread_affinity(desc); } cpumask_clear(desc->pending_mask); -- cgit v1.2.3 From 9ba5f005c994ad28e266a0cd14ef29354be382c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 14:18:35 +0200 Subject: softirq: introduce tasklet_hrtimer infrastructure commit ca109491f (hrtimer: removing all ur callback modes) moved all hrtimer callbacks into hard interrupt context when high resolution timers are active. That breaks code which relied on the assumption that the callback happens in softirq context. Provide a generic infrastructure which combines tasklets and hrtimers together to provide an in-softirq hrtimer experience. Signed-off-by: Peter Zijlstra Cc: torvalds@linux-foundation.org Cc: kaber@trash.net Cc: David Miller LKML-Reference: <1248265724.27058.1366.camel@twins> Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 3a94905fa5d..eb5e131a048 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -345,7 +345,9 @@ void open_softirq(int nr, void (*action)(struct softirq_action *)) softirq_vec[nr].action = action; } -/* Tasklets */ +/* + * Tasklets + */ struct tasklet_head { struct tasklet_struct *head; @@ -493,6 +495,66 @@ void tasklet_kill(struct tasklet_struct *t) EXPORT_SYMBOL(tasklet_kill); +/* + * tasklet_hrtimer + */ + +/* + * The trampoline is called when the hrtimer expires. If this is + * called from the hrtimer interrupt then we schedule the tasklet as + * the timer callback function expects to run in softirq context. If + * it's called in softirq context anyway (i.e. high resolution timers + * disabled) then the hrtimer callback is called right away. + */ +static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) +{ + struct tasklet_hrtimer *ttimer = + container_of(timer, struct tasklet_hrtimer, timer); + + if (hrtimer_is_hres_active(timer)) { + tasklet_hi_schedule(&ttimer->tasklet); + return HRTIMER_NORESTART; + } + return ttimer->function(timer); +} + +/* + * Helper function which calls the hrtimer callback from + * tasklet/softirq context + */ +static void __tasklet_hrtimer_trampoline(unsigned long data) +{ + struct tasklet_hrtimer *ttimer = (void *)data; + enum hrtimer_restart restart; + + restart = ttimer->function(&ttimer->timer); + if (restart != HRTIMER_NORESTART) + hrtimer_restart(&ttimer->timer); +} + +/** + * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks + * @ttimer: tasklet_hrtimer which is initialized + * @function: hrtimer callback funtion which gets called from softirq context + * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) + * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) + */ +void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t which_clock, enum hrtimer_mode mode) +{ + hrtimer_init(&ttimer->timer, which_clock, mode); + ttimer->timer.function = __hrtimer_tasklet_trampoline; + tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, + (unsigned long)ttimer); + ttimer->function = function; +} +EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); + +/* + * Remote softirq bits + */ + DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); EXPORT_PER_CPU_SYMBOL(softirq_work_list); -- cgit v1.2.3 From c9f73a3dd27e03411f18a58c0814d51392d2b17a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 21 Jul 2009 00:55:05 -0700 Subject: perf: Fix stack data leak the "reserved" field was not initialized to zero, resulting in 4 bytes of stack data leaking to userspace.... Signed-off-by: Arjan van de Ven Signed-off-by: Peter Zijlstra --- kernel/perf_counter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5c6fae4f43d..ff854fd89a8 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2666,6 +2666,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(cpu_entry); cpu_entry.cpu = raw_smp_processor_id(); + cpu_entry.reserved = 0; } if (sample_type & PERF_SAMPLE_PERIOD) -- cgit v1.2.3 From 573402db02746179b3f95f83a11a787501f52d0a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 11:13:50 +0200 Subject: perf_counter: Plug more stack leaks Per example of Arjan's patch, I went through and found a few more. Signed-off-by: Peter Zijlstra --- kernel/perf_counter.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ff854fd89a8..e1d6a3aa133 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2897,8 +2897,11 @@ void perf_counter_fork(struct task_struct *task) .event = { .header = { .type = PERF_EVENT_FORK, + .misc = 0, .size = sizeof(fork_event.event), }, + /* .pid */ + /* .ppid */ }, }; @@ -3008,8 +3011,16 @@ void perf_counter_comm(struct task_struct *task) comm_event = (struct perf_comm_event){ .task = task, + /* .comm */ + /* .comm_size */ .event = { - .header = { .type = PERF_EVENT_COMM, }, + .header = { + .type = PERF_EVENT_COMM, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ }, }; @@ -3160,8 +3171,16 @@ void __perf_counter_mmap(struct vm_area_struct *vma) mmap_event = (struct perf_mmap_event){ .vma = vma, + /* .file_name */ + /* .file_size */ .event = { - .header = { .type = PERF_EVENT_MMAP, }, + .header = { + .type = PERF_EVENT_MMAP, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ .start = vma->vm_start, .len = vma->vm_end - vma->vm_start, .pgoff = vma->vm_pgoff, -- cgit v1.2.3 From 7f453c24b95a085fc7bd35d53b33abc4dc5a048b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 13:19:40 +0200 Subject: perf_counter: PERF_SAMPLE_ID and inherited counters Anton noted that for inherited counters the counter-id as provided by PERF_SAMPLE_ID isn't mappable to the id found through PERF_RECORD_ID because each inherited counter gets its own id. His suggestion was to always return the parent counter id, since that is the primary counter id as exposed. However, these inherited counters have a unique identifier so that events like PERF_EVENT_PERIOD and PERF_EVENT_THROTTLE can be specific about which counter gets modified, which is important when trying to normalize the sample streams. This patch removes PERF_EVENT_PERIOD in favour of PERF_SAMPLE_PERIOD, which is more useful anyway, since changing periods became a lot more common than initially thought -- rendering PERF_EVENT_PERIOD the less useful solution (also, PERF_SAMPLE_PERIOD reports the more accurate value, since it reports the value used to trigger the overflow, whereas PERF_EVENT_PERIOD simply reports the requested period changed, which might only take effect on the next cycle). This still leaves us PERF_EVENT_THROTTLE to consider, but since that _should_ be a rare occurrence, and linking it to a primary id is the most useful bit to diagnose the problem, we introduce a PERF_SAMPLE_STREAM_ID, for those few cases where the full reconstruction is important. [Does change the ABI a little, but I see no other way out] Suggested-by: Anton Blanchard Signed-off-by: Peter Zijlstra LKML-Reference: <1248095846.15751.8781.camel@twins> --- kernel/perf_counter.c | 92 +++++++++++++++++---------------------------------- 1 file changed, 31 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e1d6a3aa133..7530588fa5c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -154,6 +154,20 @@ static void unclone_ctx(struct perf_counter_context *ctx) } } +/* + * If we inherit counters we want to return the parent counter id + * to userspace. + */ +static u64 primary_counter_id(struct perf_counter *counter) +{ + u64 id = counter->id; + + if (counter->parent) + id = counter->parent->id; + + return id; +} + /* * Get the perf_counter_context for a task and lock it. * This has to cope with with the fact that until it is locked, @@ -1296,7 +1310,6 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_counter *counter, int enable); -static void perf_log_period(struct perf_counter *counter, u64 period); static void perf_adjust_period(struct perf_counter *counter, u64 events) { @@ -1315,8 +1328,6 @@ static void perf_adjust_period(struct perf_counter *counter, u64 events) if (!sample_period) sample_period = 1; - perf_log_period(counter, sample_period); - hwc->sample_period = sample_period; } @@ -1705,7 +1716,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) values[n++] = counter->total_time_running + atomic64_read(&counter->child_total_time_running); if (counter->attr.read_format & PERF_FORMAT_ID) - values[n++] = counter->id; + values[n++] = primary_counter_id(counter); mutex_unlock(&counter->child_mutex); if (count < n * sizeof(u64)) @@ -1812,8 +1823,6 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) counter->attr.sample_freq = value; } else { - perf_log_period(counter, value); - counter->attr.sample_period = value; counter->hw.sample_period = value; } @@ -2662,6 +2671,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_ID) header.size += sizeof(u64); + if (sample_type & PERF_SAMPLE_STREAM_ID) + header.size += sizeof(u64); + if (sample_type & PERF_SAMPLE_CPU) { header.size += sizeof(cpu_entry); @@ -2705,7 +2717,13 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_ADDR) perf_output_put(&handle, data->addr); - if (sample_type & PERF_SAMPLE_ID) + if (sample_type & PERF_SAMPLE_ID) { + u64 id = primary_counter_id(counter); + + perf_output_put(&handle, id); + } + + if (sample_type & PERF_SAMPLE_STREAM_ID) perf_output_put(&handle, counter->id); if (sample_type & PERF_SAMPLE_CPU) @@ -2728,7 +2746,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sub != counter) sub->pmu->read(sub); - group_entry.id = sub->id; + group_entry.id = primary_counter_id(sub); group_entry.counter = atomic64_read(&sub->count); perf_output_put(&handle, group_entry); @@ -2788,15 +2806,8 @@ perf_counter_read_event(struct perf_counter *counter, } if (counter->attr.read_format & PERF_FORMAT_ID) { - u64 id; - event.header.size += sizeof(u64); - if (counter->parent) - id = counter->parent->id; - else - id = counter->id; - - event.format[i++] = id; + event.format[i++] = primary_counter_id(counter); } ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); @@ -3190,49 +3201,6 @@ void __perf_counter_mmap(struct vm_area_struct *vma) perf_counter_mmap_event(&mmap_event); } -/* - * Log sample_period changes so that analyzing tools can re-normalize the - * event flow. - */ - -struct freq_event { - struct perf_event_header header; - u64 time; - u64 id; - u64 period; -}; - -static void perf_log_period(struct perf_counter *counter, u64 period) -{ - struct perf_output_handle handle; - struct freq_event event; - int ret; - - if (counter->hw.sample_period == period) - return; - - if (counter->attr.sample_type & PERF_SAMPLE_PERIOD) - return; - - event = (struct freq_event) { - .header = { - .type = PERF_EVENT_PERIOD, - .misc = 0, - .size = sizeof(event), - }, - .time = sched_clock(), - .id = counter->id, - .period = period, - }; - - ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0); - if (ret) - return; - - perf_output_put(&handle, event); - perf_output_end(&handle); -} - /* * IRQ throttle logging */ @@ -3246,14 +3214,16 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) struct perf_event_header header; u64 time; u64 id; + u64 stream_id; } throttle_event = { .header = { .type = PERF_EVENT_THROTTLE + 1, .misc = 0, .size = sizeof(throttle_event), }, - .time = sched_clock(), - .id = counter->id, + .time = sched_clock(), + .id = primary_counter_id(counter), + .stream_id = counter->id, }; ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); -- cgit v1.2.3 From 966ee4d6b887c14159043ac80b8c3661d2bbe5e2 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 22 Jul 2009 23:05:46 +1000 Subject: perf_counter: Fix throttle/unthrottle event logging Right now we only print PERF_EVENT_THROTTLE + 1 (ie PERF_EVENT_UNTHROTTLE). Fix this to print both a throttle and unthrottle event. Signed-off-by: Anton Blanchard Signed-off-by: Peter Zijlstra LKML-Reference: <20090722130546.GE9029@kryten> --- kernel/perf_counter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7530588fa5c..787d4daef18 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3217,7 +3217,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) u64 stream_id; } throttle_event = { .header = { - .type = PERF_EVENT_THROTTLE + 1, + .type = PERF_EVENT_THROTTLE, .misc = 0, .size = sizeof(throttle_event), }, @@ -3226,6 +3226,9 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) .stream_id = counter->id, }; + if (enable) + throttle_event.header.type = PERF_EVENT_UNTHROTTLE; + ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); if (ret) return; -- cgit v1.2.3 From 0dc3d523e8bc4718e0be2e4a742367d6e4be77cd Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 21 Jul 2009 00:55:05 -0700 Subject: perf: fix stack data leak the "reserved" field was not initialized to zero, resulting in 4 bytes of stack data leaking to userspace.... Signed-off-by: Arjan van de Ven Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/perf_counter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index a641eb753b8..7bc888dfd06 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2665,6 +2665,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(cpu_entry); cpu_entry.cpu = raw_smp_processor_id(); + cpu_entry.reserved = 0; } if (sample_type & PERF_SAMPLE_PERIOD) -- cgit v1.2.3 From 61f3826133dc07142935fb5712fc738e19eb5575 Mon Sep 17 00:00:00 2001 From: Bruno Premont Date: Wed, 22 Jul 2009 22:22:32 +0200 Subject: genirq: Fix UP compile failure caused by irq_thread_check_affinity Since genirq: Delegate irq affinity setting to the irq thread (591d2fb02ea80472d846c0b8507007806bdd69cc) compilation with CONFIG_SMP=n fails with following error: /usr/src/linux-2.6/kernel/irq/manage.c: In function 'irq_thread_check_affinity': /usr/src/linux-2.6/kernel/irq/manage.c:475: error: 'struct irq_desc' has no member named 'affinity' make[4]: *** [kernel/irq/manage.o] Error 1 That commit adds a new function irq_thread_check_affinity() which uses struct irq_desc.affinity which is only available for CONFIG_SMP=y. Move that function under #ifdef CONFIG_SMP. [ tglx@brownpaperbag: compile and boot tested on UP and SMP ] Signed-off-by: Bruno Premont LKML-Reference: <20090722222232.2eb3e1c4@neptune.home> Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f0de36f13a4..61c679db468 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -451,6 +451,7 @@ static int irq_wait_for_interrupt(struct irqaction *action) return -1; } +#ifdef CONFIG_SMP /* * Check whether we need to change the affinity of the interrupt thread. */ @@ -478,6 +479,10 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); } +#else +static inline void +irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } +#endif /* * Interrupt handler thread -- cgit v1.2.3 From 6560dc160f3a96b8f1f43e2c6b51aa6eb9898b90 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Thu, 23 Jul 2009 23:42:08 +0930 Subject: module: use MODULE_SYMBOL_PREFIX with module_layout The check_modstruct_version() needs to look up the symbol "module_layout" in the kernel, but it does so literally and not by a C identifier. The trouble is that it does not include a symbol prefix for those ports that need it (like the Blackfin and H8300 port). So make sure we tack on the MODULE_SYMBOL_PREFIX define to the front of it. Signed-off-by: Mike Frysinger Signed-off-by: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/module.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0a049837008..fd141140355 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1068,7 +1068,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, { const unsigned long *crc; - if (!find_symbol("module_layout", NULL, &crc, true, false)) + if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, + &crc, true, false)) BUG(); return check_version(sechdrs, versindex, "module_layout", mod, crc); } -- cgit v1.2.3 From 9ae260270c90643156cda73427aa1f04c923e627 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 19 Jun 2009 02:51:13 +0200 Subject: update the comment in kthread_stop() Commit 63706172f332fd3f6e7458ebfb35fa6de9c21dc5 ("kthreads: rework kthread_stop()") removed the limitation that the thread function mysr not call do_exit() itself, but forgot to update the comment. Since that commit it is OK to use kthread_stop() even if kthread can exit itself. Signed-off-by: Oleg Nesterov Signed-off-by: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/kthread.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 9b1a7de2697..eb8751aa041 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -180,10 +180,12 @@ EXPORT_SYMBOL(kthread_bind); * @k: thread created by kthread_create(). * * Sets kthread_should_stop() for @k to return true, wakes it, and - * waits for it to exit. Your threadfn() must not call do_exit() - * itself if you use this function! This can also be called after - * kthread_create() instead of calling wake_up_process(): the thread - * will exit without calling threadfn(). + * waits for it to exit. This can also be called after kthread_create() + * instead of calling wake_up_process(): the thread will exit without + * calling threadfn(). + * + * If threadfn() may call do_exit() itself, the caller must ensure + * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() * was never called. -- cgit v1.2.3 From 38ceb592fcac9110c6b3c87ea0a27bff68c43486 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Jul 2009 20:11:24 +0800 Subject: tracing: Fix invalid function_graph entry When print_graph_entry() computes a function call entry event, it needs to also check the next entry to guess if it matches the return event of the current function entry. In order to look at this next event, it needs to consume the current entry before going ahead in the ring buffer. However, if the current event that gets consumed is the last one in the ring buffer head page, the ring_buffer may reuse the page for writers. The consumed entry will then become invalid because of possible racy overwriting. Me must then handle this entry by making a copy of it. The fix also applies on 2.6.30 Signed-off-by: Lai Jiangshan Cc: Steven Rostedt Cc: stable@kernel.org LKML-Reference: <4A6EEAEC.3050508@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_functions_graph.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d2249abafb5..420ec348757 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -843,9 +843,16 @@ print_graph_function(struct trace_iterator *iter) switch (entry->type) { case TRACE_GRAPH_ENT: { - struct ftrace_graph_ent_entry *field; + /* + * print_graph_entry() may consume the current event, + * thus @field may become invalid, so we need to save it. + * sizeof(struct ftrace_graph_ent_entry) is very small, + * it can be safely saved at the stack. + */ + struct ftrace_graph_ent_entry *field, saved; trace_assign_type(field, entry); - return print_graph_entry(field, s, iter); + saved = *field; + return print_graph_entry(&saved, s, iter); } case TRACE_GRAPH_RET: { struct ftrace_graph_ret_entry *field; -- cgit v1.2.3 From 74e7ff8c50b6b022e6ffaa736b16a4dc161d3eaf Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Jul 2009 20:17:22 +0800 Subject: tracing: Fix missing function_graph events when we splice_read from trace_pipe About a half events are missing when we splice_read from trace_pipe. They are unexpectedly consumed because we ignore the TRACE_TYPE_NO_CONSUME return value used by the function graph tracer when it needs to consume the events by itself to walk on the ring buffer. The same problem appears with ftrace_dump() Example of an output before this patch: 1) | ktime_get_real() { 1) 2.846 us | read_hpet(); 1) 4.558 us | } 1) 6.195 us | } After this patch: 0) | ktime_get_real() { 0) | getnstimeofday() { 0) 1.960 us | read_hpet(); 0) 3.597 us | } 0) 5.196 us | } The fix also applies on 2.6.30 Signed-off-by: Lai Jiangshan Cc: Steven Rostedt Cc: stable@kernel.org LKML-Reference: <4A6EEC52.90704@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8bc8d8afea6..da984ad065a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3085,7 +3085,8 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) break; } - trace_consume(iter); + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(iter); rem -= count; if (!find_next_entry_inc(iter)) { rem = 0; @@ -4233,8 +4234,11 @@ static void __ftrace_dump(bool disable_tracing) iter.pos = -1; if (find_next_entry_inc(&iter) != NULL) { - print_trace_line(&iter); - trace_consume(&iter); + int ret; + + ret = print_trace_line(&iter); + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(&iter); } trace_printk_seq(&iter.seq); -- cgit v1.2.3