From 0793a61d4df8daeac6492dbf8d2f3e5713caae5e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Dec 2008 20:12:29 +0100 Subject: performance counters: core code Implement the core kernel bits of Performance Counters subsystem. The Linux Performance Counter subsystem provides an abstraction of performance counter hardware capabilities. It provides per task and per CPU counters, and it provides event capabilities on top of those. Performance counters are accessed via special file descriptors. There's one file descriptor per virtual counter used. The special file descriptor is opened via the perf_counter_open() system call: int perf_counter_open(u32 hw_event_type, u32 hw_event_period, u32 record_type, pid_t pid, int cpu); The syscall returns the new fd. The fd can be used via the normal VFS system calls: read() can be used to read the counter, fcntl() can be used to set the blocking mode, etc. Multiple counters can be kept open at a time, and the counters can be poll()ed. See more details in Documentation/perf-counters.txt. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 + kernel/fork.c | 1 + kernel/perf_counter.c | 943 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched.c | 24 ++ kernel/sys_ni.c | 3 + 5 files changed, 972 insertions(+) create mode 100644 kernel/perf_counter.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 19fad003b19..1f184a1dc40 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -89,6 +89,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/fork.c b/kernel/fork.c index 2a372a0e206..441fadff1fa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -975,6 +975,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; rt_mutex_init_task(p); + perf_counter_init_task(p); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c new file mode 100644 index 00000000000..20508f05365 --- /dev/null +++ b/kernel/perf_counter.c @@ -0,0 +1,943 @@ +/* + * Performance counter core code + * + * Copyright(C) 2008 Thomas Gleixner + * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Each CPU has a list of per CPU counters: + */ +DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); + +int perf_max_counters __read_mostly; +static int perf_reserved_percpu __read_mostly; +static int perf_overcommit __read_mostly = 1; + +/* + * Mutex for (sysadmin-configurable) counter reservations: + */ +static DEFINE_MUTEX(perf_resource_mutex); + +/* + * Architecture provided APIs - weak aliases: + */ + +int __weak hw_perf_counter_init(struct perf_counter *counter, u32 hw_event_type) +{ + return -EINVAL; +} + +void __weak hw_perf_counter_enable(struct perf_counter *counter) { } +void __weak hw_perf_counter_disable(struct perf_counter *counter) { } +void __weak hw_perf_counter_read(struct perf_counter *counter) { } +void __weak hw_perf_disable_all(void) { } +void __weak hw_perf_enable_all(void) { } +void __weak hw_perf_counter_setup(void) { } + +#if BITS_PER_LONG == 64 + +/* + * Read the cached counter in counter safe against cross CPU / NMI + * modifications. 64 bit version - no complications. + */ +static inline u64 perf_read_counter_safe(struct perf_counter *counter) +{ + return (u64) atomic64_read(&counter->count); +} + +#else + +/* + * Read the cached counter in counter safe against cross CPU / NMI + * modifications. 32 bit version. + */ +static u64 perf_read_counter_safe(struct perf_counter *counter) +{ + u32 cntl, cnth; + + local_irq_disable(); + do { + cnth = atomic_read(&counter->count32[1]); + cntl = atomic_read(&counter->count32[0]); + } while (cnth != atomic_read(&counter->count32[1])); + + local_irq_enable(); + + return cntl | ((u64) cnth) << 32; +} + +#endif + +/* + * Cross CPU call to remove a performance counter + * + * We disable the counter on the hardware level first. After that we + * remove it from the context list. + */ +static void __perf_remove_from_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter *counter = info; + struct perf_counter_context *ctx = counter->ctx; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + + if (counter->active) { + hw_perf_counter_disable(counter); + counter->active = 0; + ctx->nr_active--; + cpuctx->active_oncpu--; + counter->task = NULL; + } + ctx->nr_counters--; + + /* + * Protect the list operation against NMI by disabling the + * counters on a global level. NOP for non NMI based counters. + */ + hw_perf_disable_all(); + list_del_init(&counter->list); + hw_perf_enable_all(); + + if (!ctx->task) { + /* + * Allow more per task counters with respect to the + * reservation: + */ + cpuctx->max_pertask = + min(perf_max_counters - ctx->nr_counters, + perf_max_counters - perf_reserved_percpu); + } + + spin_unlock(&ctx->lock); +} + + +/* + * Remove the counter from a task's (or a CPU's) list of counters. + * + * Must be called with counter->mutex held. + * + * CPU counters are removed with a smp call. For task counters we only + * call when the task is on a CPU. + */ +static void perf_remove_from_context(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Per cpu counters are removed via an smp call and + * the removal is always sucessful. + */ + smp_call_function_single(counter->cpu, + __perf_remove_from_context, + counter, 1); + return; + } + +retry: + task_oncpu_function_call(task, __perf_remove_from_context, + counter); + + spin_lock_irq(&ctx->lock); + /* + * If the context is active we need to retry the smp call. + */ + if (ctx->nr_active && !list_empty(&counter->list)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can remove the counter safely, if it the call above did not + * succeed. + */ + if (!list_empty(&counter->list)) { + ctx->nr_counters--; + list_del_init(&counter->list); + counter->task = NULL; + } + spin_unlock_irq(&ctx->lock); +} + +/* + * Cross CPU call to install and enable a preformance counter + */ +static void __perf_install_in_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter *counter = info; + struct perf_counter_context *ctx = counter->ctx; + int cpu = smp_processor_id(); + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + + /* + * Protect the list operation against NMI by disabling the + * counters on a global level. NOP for non NMI based counters. + */ + hw_perf_disable_all(); + list_add_tail(&counter->list, &ctx->counters); + hw_perf_enable_all(); + + ctx->nr_counters++; + + if (cpuctx->active_oncpu < perf_max_counters) { + hw_perf_counter_enable(counter); + counter->active = 1; + counter->oncpu = cpu; + ctx->nr_active++; + cpuctx->active_oncpu++; + } + + if (!ctx->task && cpuctx->max_pertask) + cpuctx->max_pertask--; + + spin_unlock(&ctx->lock); +} + +/* + * Attach a performance counter to a context + * + * First we add the counter to the list with the hardware enable bit + * in counter->hw_config cleared. + * + * If the counter is attached to a task which is on a CPU we use a smp + * call to enable it in the task context. The task might have been + * scheduled away, but we check this in the smp call again. + */ +static void +perf_install_in_context(struct perf_counter_context *ctx, + struct perf_counter *counter, + int cpu) +{ + struct task_struct *task = ctx->task; + + counter->ctx = ctx; + if (!task) { + /* + * Per cpu counters are installed via an smp call and + * the install is always sucessful. + */ + smp_call_function_single(cpu, __perf_install_in_context, + counter, 1); + return; + } + + counter->task = task; +retry: + task_oncpu_function_call(task, __perf_install_in_context, + counter); + + spin_lock_irq(&ctx->lock); + /* + * If the context is active and the counter has not been added + * we need to retry the smp call. + */ + if (ctx->nr_active && list_empty(&counter->list)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can add the counter safely, if it the call above did not + * succeed. + */ + if (list_empty(&counter->list)) { + list_add_tail(&counter->list, &ctx->counters); + ctx->nr_counters++; + } + spin_unlock_irq(&ctx->lock); +} + +/* + * Called from scheduler to remove the counters of the current task, + * with interrupts disabled. + * + * We stop each counter and update the counter value in counter->count. + * + * This does not protect us against NMI, but hw_perf_counter_disable() + * sets the disabled bit in the control field of counter _before_ + * accessing the counter control register. If a NMI hits, then it will + * not restart the counter. + */ +void perf_counter_task_sched_out(struct task_struct *task, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter *counter; + + if (likely(!cpuctx->task_ctx)) + return; + + spin_lock(&ctx->lock); + list_for_each_entry(counter, &ctx->counters, list) { + if (!ctx->nr_active) + break; + if (counter->active) { + hw_perf_counter_disable(counter); + counter->active = 0; + counter->oncpu = -1; + ctx->nr_active--; + cpuctx->active_oncpu--; + } + } + spin_unlock(&ctx->lock); + cpuctx->task_ctx = NULL; +} + +/* + * Called from scheduler to add the counters of the current task + * with interrupts disabled. + * + * We restore the counter value and then enable it. + * + * This does not protect us against NMI, but hw_perf_counter_enable() + * sets the enabled bit in the control field of counter _before_ + * accessing the counter control register. If a NMI hits, then it will + * keep the counter running. + */ +void perf_counter_task_sched_in(struct task_struct *task, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter *counter; + + if (likely(!ctx->nr_counters)) + return; + + spin_lock(&ctx->lock); + list_for_each_entry(counter, &ctx->counters, list) { + if (ctx->nr_active == cpuctx->max_pertask) + break; + if (counter->cpu != -1 && counter->cpu != cpu) + continue; + + hw_perf_counter_enable(counter); + counter->active = 1; + counter->oncpu = cpu; + ctx->nr_active++; + cpuctx->active_oncpu++; + } + spin_unlock(&ctx->lock); + cpuctx->task_ctx = ctx; +} + +void perf_counter_task_tick(struct task_struct *curr, int cpu) +{ + struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter *counter; + + if (likely(!ctx->nr_counters)) + return; + + perf_counter_task_sched_out(curr, cpu); + + spin_lock(&ctx->lock); + + /* + * Rotate the first entry last: + */ + hw_perf_disable_all(); + list_for_each_entry(counter, &ctx->counters, list) { + list_del(&counter->list); + list_add_tail(&counter->list, &ctx->counters); + break; + } + hw_perf_enable_all(); + + spin_unlock(&ctx->lock); + + perf_counter_task_sched_in(curr, cpu); +} + +/* + * Initialize the perf_counter context in task_struct + */ +void perf_counter_init_task(struct task_struct *task) +{ + struct perf_counter_context *ctx = &task->perf_counter_ctx; + + spin_lock_init(&ctx->lock); + INIT_LIST_HEAD(&ctx->counters); + ctx->nr_counters = 0; + ctx->task = task; +} + +/* + * Cross CPU call to read the hardware counter + */ +static void __hw_perf_counter_read(void *info) +{ + hw_perf_counter_read(info); +} + +static u64 perf_read_counter(struct perf_counter *counter) +{ + /* + * If counter is enabled and currently active on a CPU, update the + * value in the counter structure: + */ + if (counter->active) { + smp_call_function_single(counter->oncpu, + __hw_perf_counter_read, counter, 1); + } + + return perf_read_counter_safe(counter); +} + +/* + * Cross CPU call to switch performance data pointers + */ +static void __perf_switch_irq_data(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter *counter = info; + struct perf_counter_context *ctx = counter->ctx; + struct perf_data *oldirqdata = counter->irqdata; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + */ + if (ctx->task) { + if (cpuctx->task_ctx != ctx) + return; + spin_lock(&ctx->lock); + } + + /* Change the pointer NMI safe */ + atomic_long_set((atomic_long_t *)&counter->irqdata, + (unsigned long) counter->usrdata); + counter->usrdata = oldirqdata; + + if (ctx->task) + spin_unlock(&ctx->lock); +} + +static struct perf_data *perf_switch_irq_data(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + struct perf_data *oldirqdata = counter->irqdata; + struct task_struct *task = ctx->task; + + if (!task) { + smp_call_function_single(counter->cpu, + __perf_switch_irq_data, + counter, 1); + return counter->usrdata; + } + +retry: + spin_lock_irq(&ctx->lock); + if (!counter->active) { + counter->irqdata = counter->usrdata; + counter->usrdata = oldirqdata; + spin_unlock_irq(&ctx->lock); + return oldirqdata; + } + spin_unlock_irq(&ctx->lock); + task_oncpu_function_call(task, __perf_switch_irq_data, counter); + /* Might have failed, because task was scheduled out */ + if (counter->irqdata == oldirqdata) + goto retry; + + return counter->usrdata; +} + +static void put_context(struct perf_counter_context *ctx) +{ + if (ctx->task) + put_task_struct(ctx->task); +} + +static struct perf_counter_context *find_get_context(pid_t pid, int cpu) +{ + struct perf_cpu_context *cpuctx; + struct perf_counter_context *ctx; + struct task_struct *task; + + /* + * If cpu is not a wildcard then this is a percpu counter: + */ + if (cpu != -1) { + /* Must be root to operate on a CPU counter: */ + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EACCES); + + if (cpu < 0 || cpu > num_possible_cpus()) + return ERR_PTR(-EINVAL); + + /* + * We could be clever and allow to attach a counter to an + * offline CPU and activate it when the CPU comes up, but + * that's for later. + */ + if (!cpu_isset(cpu, cpu_online_map)) + return ERR_PTR(-ENODEV); + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = &cpuctx->ctx; + + WARN_ON_ONCE(ctx->task); + return ctx; + } + + rcu_read_lock(); + if (!pid) + task = current; + else + task = find_task_by_vpid(pid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + if (!task) + return ERR_PTR(-ESRCH); + + ctx = &task->perf_counter_ctx; + ctx->task = task; + + /* Reuse ptrace permission checks for now. */ + if (!ptrace_may_access(task, PTRACE_MODE_READ)) { + put_context(ctx); + return ERR_PTR(-EACCES); + } + + return ctx; +} + +/* + * Called when the last reference to the file is gone. + */ +static int perf_release(struct inode *inode, struct file *file) +{ + struct perf_counter *counter = file->private_data; + struct perf_counter_context *ctx = counter->ctx; + + file->private_data = NULL; + + mutex_lock(&counter->mutex); + + perf_remove_from_context(counter); + put_context(ctx); + + mutex_unlock(&counter->mutex); + + kfree(counter); + + return 0; +} + +/* + * Read the performance counter - simple non blocking version for now + */ +static ssize_t +perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) +{ + u64 cntval; + + if (count != sizeof(cntval)) + return -EINVAL; + + mutex_lock(&counter->mutex); + cntval = perf_read_counter(counter); + mutex_unlock(&counter->mutex); + + return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval); +} + +static ssize_t +perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count) +{ + if (!usrdata->len) + return 0; + + count = min(count, (size_t)usrdata->len); + if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count)) + return -EFAULT; + + /* Adjust the counters */ + usrdata->len -= count; + if (!usrdata->len) + usrdata->rd_idx = 0; + else + usrdata->rd_idx += count; + + return count; +} + +static ssize_t +perf_read_irq_data(struct perf_counter *counter, + char __user *buf, + size_t count, + int nonblocking) +{ + struct perf_data *irqdata, *usrdata; + DECLARE_WAITQUEUE(wait, current); + ssize_t res; + + irqdata = counter->irqdata; + usrdata = counter->usrdata; + + if (usrdata->len + irqdata->len >= count) + goto read_pending; + + if (nonblocking) + return -EAGAIN; + + spin_lock_irq(&counter->waitq.lock); + __add_wait_queue(&counter->waitq, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (usrdata->len + irqdata->len >= count) + break; + + if (signal_pending(current)) + break; + + spin_unlock_irq(&counter->waitq.lock); + schedule(); + spin_lock_irq(&counter->waitq.lock); + } + __remove_wait_queue(&counter->waitq, &wait); + __set_current_state(TASK_RUNNING); + spin_unlock_irq(&counter->waitq.lock); + + if (usrdata->len + irqdata->len < count) + return -ERESTARTSYS; +read_pending: + mutex_lock(&counter->mutex); + + /* Drain pending data first: */ + res = perf_copy_usrdata(usrdata, buf, count); + if (res < 0 || res == count) + goto out; + + /* Switch irq buffer: */ + usrdata = perf_switch_irq_data(counter); + if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) { + if (!res) + res = -EFAULT; + } else { + res = count; + } +out: + mutex_unlock(&counter->mutex); + + return res; +} + +static ssize_t +perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct perf_counter *counter = file->private_data; + + switch (counter->record_type) { + case PERF_RECORD_SIMPLE: + return perf_read_hw(counter, buf, count); + + case PERF_RECORD_IRQ: + case PERF_RECORD_GROUP: + return perf_read_irq_data(counter, buf, count, + file->f_flags & O_NONBLOCK); + } + return -EINVAL; +} + +static unsigned int perf_poll(struct file *file, poll_table *wait) +{ + struct perf_counter *counter = file->private_data; + unsigned int events = 0; + unsigned long flags; + + poll_wait(file, &counter->waitq, wait); + + spin_lock_irqsave(&counter->waitq.lock, flags); + if (counter->usrdata->len || counter->irqdata->len) + events |= POLLIN; + spin_unlock_irqrestore(&counter->waitq.lock, flags); + + return events; +} + +static const struct file_operations perf_fops = { + .release = perf_release, + .read = perf_read, + .poll = perf_poll, +}; + +/* + * Allocate and initialize a counter structure + */ +static struct perf_counter * +perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) +{ + struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); + + if (!counter) + return NULL; + + mutex_init(&counter->mutex); + INIT_LIST_HEAD(&counter->list); + init_waitqueue_head(&counter->waitq); + + counter->irqdata = &counter->data[0]; + counter->usrdata = &counter->data[1]; + counter->cpu = cpu; + counter->record_type = record_type; + counter->__irq_period = hw_event_period; + counter->wakeup_pending = 0; + + return counter; +} + +/** + * sys_perf_task_open - open a performance counter associate it to a task + * @hw_event_type: event type for monitoring/sampling... + * @pid: target pid + */ +asmlinkage int +sys_perf_counter_open(u32 hw_event_type, + u32 hw_event_period, + u32 record_type, + pid_t pid, + int cpu) +{ + struct perf_counter_context *ctx; + struct perf_counter *counter; + int ret; + + ctx = find_get_context(pid, cpu); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = -ENOMEM; + counter = perf_counter_alloc(hw_event_period, cpu, record_type); + if (!counter) + goto err_put_context; + + ret = hw_perf_counter_init(counter, hw_event_type); + if (ret) + goto err_free_put_context; + + perf_install_in_context(ctx, counter, cpu); + + ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); + if (ret < 0) + goto err_remove_free_put_context; + + return ret; + +err_remove_free_put_context: + mutex_lock(&counter->mutex); + perf_remove_from_context(counter); + mutex_unlock(&counter->mutex); + +err_free_put_context: + kfree(counter); + +err_put_context: + put_context(ctx); + + return ret; +} + +static void __cpuinit perf_init_cpu(int cpu) +{ + struct perf_cpu_context *ctx; + + ctx = &per_cpu(perf_cpu_context, cpu); + spin_lock_init(&ctx->ctx.lock); + INIT_LIST_HEAD(&ctx->ctx.counters); + + mutex_lock(&perf_resource_mutex); + ctx->max_pertask = perf_max_counters - perf_reserved_percpu; + mutex_unlock(&perf_resource_mutex); + hw_perf_counter_setup(); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void __perf_exit_cpu(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter_context *ctx = &cpuctx->ctx; + struct perf_counter *counter, *tmp; + + list_for_each_entry_safe(counter, tmp, &ctx->counters, list) + __perf_remove_from_context(counter); + +} +static void perf_exit_cpu(int cpu) +{ + smp_call_function_single(cpu, __perf_exit_cpu, NULL, 1); +} +#else +static inline void perf_exit_cpu(int cpu) { } +#endif + +static int __cpuinit +perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action) { + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + perf_init_cpu(cpu); + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + perf_exit_cpu(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata perf_cpu_nb = { + .notifier_call = perf_cpu_notify, +}; + +static int __init perf_counter_init(void) +{ + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&perf_cpu_nb); + + return 0; +} +early_initcall(perf_counter_init); + +static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_reserved_percpu); +} + +static ssize_t +perf_set_reserve_percpu(struct sysdev_class *class, + const char *buf, + size_t count) +{ + struct perf_cpu_context *cpuctx; + unsigned long val; + int err, cpu, mpt; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > perf_max_counters) + return -EINVAL; + + mutex_lock(&perf_resource_mutex); + perf_reserved_percpu = val; + for_each_online_cpu(cpu) { + cpuctx = &per_cpu(perf_cpu_context, cpu); + spin_lock_irq(&cpuctx->ctx.lock); + mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, + perf_max_counters - perf_reserved_percpu); + cpuctx->max_pertask = mpt; + spin_unlock_irq(&cpuctx->ctx.lock); + } + mutex_unlock(&perf_resource_mutex); + + return count; +} + +static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_overcommit); +} + +static ssize_t +perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) +{ + unsigned long val; + int err; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > 1) + return -EINVAL; + + mutex_lock(&perf_resource_mutex); + perf_overcommit = val; + mutex_unlock(&perf_resource_mutex); + + return count; +} + +static SYSDEV_CLASS_ATTR( + reserve_percpu, + 0644, + perf_show_reserve_percpu, + perf_set_reserve_percpu + ); + +static SYSDEV_CLASS_ATTR( + overcommit, + 0644, + perf_show_overcommit, + perf_set_overcommit + ); + +static struct attribute *perfclass_attrs[] = { + &attr_reserve_percpu.attr, + &attr_overcommit.attr, + NULL +}; + +static struct attribute_group perfclass_attr_group = { + .attrs = perfclass_attrs, + .name = "perf_counters", +}; + +static int __init perf_counter_sysfs_init(void) +{ + return sysfs_create_group(&cpu_sysdev_class.kset.kobj, + &perfclass_attr_group); +} +device_initcall(perf_counter_sysfs_init); + diff --git a/kernel/sched.c b/kernel/sched.c index b7480fb5c3d..254d56de254 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2212,6 +2212,27 @@ static int sched_balance_self(int cpu, int flag) #endif /* CONFIG_SMP */ +/** + * task_oncpu_function_call - call a function on the cpu on which a task runs + * @p: the task to evaluate + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly + */ +void task_oncpu_function_call(struct task_struct *p, + void (*func) (void *info), void *info) +{ + int cpu; + + preempt_disable(); + cpu = task_cpu(p); + if (task_curr(p)) + smp_call_function_single(cpu, func, info, 1); + preempt_enable(); +} + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -2534,6 +2555,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { fire_sched_out_preempt_notifiers(prev, next); + perf_counter_task_sched_out(prev, cpu_of(rq)); prepare_lock_switch(rq, next); prepare_arch_switch(next); } @@ -2574,6 +2596,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); + perf_counter_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); #ifdef CONFIG_SMP if (current->sched_class->post_schedule) @@ -4296,6 +4319,7 @@ void scheduler_tick(void) rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif + perf_counter_task_tick(curr, cpu); } #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index e14a2328170..4be8bbc7577 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -174,3 +174,6 @@ cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); + +/* performance counters: */ +cond_syscall(sys_perf_counter_open); -- cgit v1.2.3 From eab656ae04b9d3b83265e3db01c0d2c46b748ef7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Dec 2008 19:26:59 +0100 Subject: perf counters: clean up 'raw' type API Impact: cleanup Introduce a separate hw_event type. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 20508f05365..96c333a5b0f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -734,26 +734,27 @@ perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) * @pid: target pid */ asmlinkage int -sys_perf_counter_open(u32 hw_event_type, - u32 hw_event_period, - u32 record_type, - pid_t pid, - int cpu) +sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, + pid_t pid, int cpu, int masterfd) { struct perf_counter_context *ctx; + struct perf_counter_event event; struct perf_counter *counter; int ret; + if (copy_from_user(&event, uevent, sizeof(event)) != 0) + return -EFAULT; + ctx = find_get_context(pid, cpu); if (IS_ERR(ctx)) return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(hw_event_period, cpu, record_type); + counter = perf_counter_alloc(event.hw_event_period, cpu, record_type); if (!counter) goto err_put_context; - ret = hw_perf_counter_init(counter, hw_event_type); + ret = hw_perf_counter_init(counter, event.hw_event_type); if (ret) goto err_free_put_context; -- cgit v1.2.3 From dfa7c899b401d7dc5d85aca416aee64ac82812f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Dec 2008 19:35:37 +0100 Subject: perf counters: expand use of counter->event Impact: change syscall, cleanup Make use of the new perf_counters event type. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 96c333a5b0f..2557c670a3b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -37,7 +37,7 @@ static DEFINE_MUTEX(perf_resource_mutex); * Architecture provided APIs - weak aliases: */ -int __weak hw_perf_counter_init(struct perf_counter *counter, u32 hw_event_type) +int __weak hw_perf_counter_init(struct perf_counter *counter) { return -EINVAL; } @@ -707,7 +707,7 @@ static const struct file_operations perf_fops = { * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) +perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type) { struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); @@ -722,7 +722,7 @@ perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) counter->usrdata = &counter->data[1]; counter->cpu = cpu; counter->record_type = record_type; - counter->__irq_period = hw_event_period; + counter->event = *event; counter->wakeup_pending = 0; return counter; @@ -750,11 +750,11 @@ sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(event.hw_event_period, cpu, record_type); + counter = perf_counter_alloc(&event, cpu, record_type); if (!counter) goto err_put_context; - ret = hw_perf_counter_init(counter, event.hw_event_type); + ret = hw_perf_counter_init(counter); if (ret) goto err_free_put_context; -- cgit v1.2.3 From 9f66a3810fe0d4100972db84290f3ae4a4d77025 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 10 Dec 2008 12:33:23 +0100 Subject: perf counters: restructure the API Impact: clean up new API Thorough cleanup of the new perf counters API, we now get clean separation of the various concepts: - introduce perf_counter_hw_event to separate out the event source details - move special type flags into separate attributes: PERF_COUNT_NMI, PERF_COUNT_RAW - extend the type to u64 and reserve it fully to the architecture in the raw type case. And make use of all these changes in the core and x86 perfcounters code. Also change the syscall signature to: asmlinkage int sys_perf_counter_open( struct perf_counter_hw_event *hw_event_uptr __user, pid_t pid, int cpu, int group_fd); ( Note that group_fd is unused for now - it's reserved for the counter groups abstraction. ) Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2557c670a3b..0d323ceda3a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -669,7 +669,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_counter *counter = file->private_data; - switch (counter->record_type) { + switch (counter->hw_event.record_type) { case PERF_RECORD_SIMPLE: return perf_read_hw(counter, buf, count); @@ -707,7 +707,7 @@ static const struct file_operations perf_fops = { * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type) +perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu) { struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); @@ -718,31 +718,37 @@ perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type) INIT_LIST_HEAD(&counter->list); init_waitqueue_head(&counter->waitq); - counter->irqdata = &counter->data[0]; - counter->usrdata = &counter->data[1]; - counter->cpu = cpu; - counter->record_type = record_type; - counter->event = *event; - counter->wakeup_pending = 0; + counter->irqdata = &counter->data[0]; + counter->usrdata = &counter->data[1]; + counter->cpu = cpu; + counter->hw_event = *hw_event; + counter->wakeup_pending = 0; return counter; } /** - * sys_perf_task_open - open a performance counter associate it to a task - * @hw_event_type: event type for monitoring/sampling... + * sys_perf_task_open - open a performance counter, associate it to a task/cpu + * + * @hw_event_uptr: event type attributes for monitoring/sampling * @pid: target pid + * @cpu: target cpu + * @group_fd: group leader counter fd */ -asmlinkage int -sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, - pid_t pid, int cpu, int masterfd) +asmlinkage int sys_perf_counter_open( + + struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, + int cpu, + int group_fd) + { struct perf_counter_context *ctx; - struct perf_counter_event event; + struct perf_counter_hw_event hw_event; struct perf_counter *counter; int ret; - if (copy_from_user(&event, uevent, sizeof(event)) != 0) + if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) return -EFAULT; ctx = find_get_context(pid, cpu); @@ -750,7 +756,7 @@ sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(&event, cpu, record_type); + counter = perf_counter_alloc(&hw_event, cpu); if (!counter) goto err_put_context; -- cgit v1.2.3 From 04289bb9891882202d7e961c4c04d2376930e9f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 08:38:42 +0100 Subject: perf counters: add support for group counters Impact: add group counters This patch adds the "counter groups" abstraction. Groups of counters behave much like normal 'single' counters, with a few semantic and behavioral extensions on top of that. A counter group is created by creating a new counter with the open() syscall's group-leader group_fd file descriptor parameter pointing to another, already existing counter. Groups of counters are scheduled in and out in one atomic group, and they are also roundrobin-scheduled atomically. Counters that are member of a group can also record events with an (atomic) extended timestamp that extends to all members of the group, if the record type is set to PERF_RECORD_GROUP. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 282 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 216 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0d323ceda3a..fa59fe8c02d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -55,7 +56,7 @@ void __weak hw_perf_counter_setup(void) { } * Read the cached counter in counter safe against cross CPU / NMI * modifications. 64 bit version - no complications. */ -static inline u64 perf_read_counter_safe(struct perf_counter *counter) +static inline u64 perf_counter_read_safe(struct perf_counter *counter) { return (u64) atomic64_read(&counter->count); } @@ -66,7 +67,7 @@ static inline u64 perf_read_counter_safe(struct perf_counter *counter) * Read the cached counter in counter safe against cross CPU / NMI * modifications. 32 bit version. */ -static u64 perf_read_counter_safe(struct perf_counter *counter) +static u64 perf_counter_read_safe(struct perf_counter *counter) { u32 cntl, cnth; @@ -83,13 +84,55 @@ static u64 perf_read_counter_safe(struct perf_counter *counter) #endif +static void +list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +{ + struct perf_counter *group_leader = counter->group_leader; + + /* + * Depending on whether it is a standalone or sibling counter, + * add it straight to the context's counter list, or to the group + * leader's sibling list: + */ + if (counter->group_leader == counter) + list_add_tail(&counter->list_entry, &ctx->counter_list); + else + list_add_tail(&counter->list_entry, &group_leader->sibling_list); +} + +static void +list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +{ + struct perf_counter *sibling, *tmp; + + list_del_init(&counter->list_entry); + + if (list_empty(&counter->sibling_list)) + return; + + /* + * If this was a group counter with sibling counters then + * upgrade the siblings to singleton counters by adding them + * to the context list directly: + */ + list_for_each_entry_safe(sibling, tmp, + &counter->sibling_list, list_entry) { + + list_del_init(&sibling->list_entry); + list_add_tail(&sibling->list_entry, &ctx->counter_list); + WARN_ON_ONCE(!sibling->group_leader); + WARN_ON_ONCE(sibling->group_leader == sibling); + sibling->group_leader = sibling; + } +} + /* * Cross CPU call to remove a performance counter * * We disable the counter on the hardware level first. After that we * remove it from the context list. */ -static void __perf_remove_from_context(void *info) +static void __perf_counter_remove_from_context(void *info) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; @@ -119,7 +162,7 @@ static void __perf_remove_from_context(void *info) * counters on a global level. NOP for non NMI based counters. */ hw_perf_disable_all(); - list_del_init(&counter->list); + list_del_counter(counter, ctx); hw_perf_enable_all(); if (!ctx->task) { @@ -144,7 +187,7 @@ static void __perf_remove_from_context(void *info) * CPU counters are removed with a smp call. For task counters we only * call when the task is on a CPU. */ -static void perf_remove_from_context(struct perf_counter *counter) +static void perf_counter_remove_from_context(struct perf_counter *counter) { struct perf_counter_context *ctx = counter->ctx; struct task_struct *task = ctx->task; @@ -155,32 +198,32 @@ static void perf_remove_from_context(struct perf_counter *counter) * the removal is always sucessful. */ smp_call_function_single(counter->cpu, - __perf_remove_from_context, + __perf_counter_remove_from_context, counter, 1); return; } retry: - task_oncpu_function_call(task, __perf_remove_from_context, + task_oncpu_function_call(task, __perf_counter_remove_from_context, counter); spin_lock_irq(&ctx->lock); /* * If the context is active we need to retry the smp call. */ - if (ctx->nr_active && !list_empty(&counter->list)) { + if (ctx->nr_active && !list_empty(&counter->list_entry)) { spin_unlock_irq(&ctx->lock); goto retry; } /* * The lock prevents that this context is scheduled in so we - * can remove the counter safely, if it the call above did not + * can remove the counter safely, if the call above did not * succeed. */ - if (!list_empty(&counter->list)) { + if (!list_empty(&counter->list_entry)) { ctx->nr_counters--; - list_del_init(&counter->list); + list_del_counter(counter, ctx); counter->task = NULL; } spin_unlock_irq(&ctx->lock); @@ -211,7 +254,7 @@ static void __perf_install_in_context(void *info) * counters on a global level. NOP for non NMI based counters. */ hw_perf_disable_all(); - list_add_tail(&counter->list, &ctx->counters); + list_add_counter(counter, ctx); hw_perf_enable_all(); ctx->nr_counters++; @@ -268,7 +311,7 @@ retry: * If the context is active and the counter has not been added * we need to retry the smp call. */ - if (ctx->nr_active && list_empty(&counter->list)) { + if (ctx->nr_active && list_empty(&counter->list_entry)) { spin_unlock_irq(&ctx->lock); goto retry; } @@ -278,13 +321,45 @@ retry: * can add the counter safely, if it the call above did not * succeed. */ - if (list_empty(&counter->list)) { - list_add_tail(&counter->list, &ctx->counters); + if (list_empty(&counter->list_entry)) { + list_add_counter(counter, ctx); ctx->nr_counters++; } spin_unlock_irq(&ctx->lock); } +static void +counter_sched_out(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + if (!counter->active) + return; + + hw_perf_counter_disable(counter); + counter->active = 0; + counter->oncpu = -1; + + cpuctx->active_oncpu--; + ctx->nr_active--; +} + +static void +group_sched_out(struct perf_counter *group_counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + + counter_sched_out(group_counter, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) + counter_sched_out(counter, cpuctx, ctx); +} + /* * Called from scheduler to remove the counters of the current task, * with interrupts disabled. @@ -306,21 +381,48 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) return; spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->counters, list) { - if (!ctx->nr_active) - break; - if (counter->active) { - hw_perf_counter_disable(counter); - counter->active = 0; - counter->oncpu = -1; - ctx->nr_active--; - cpuctx->active_oncpu--; - } + if (ctx->nr_active) { + list_for_each_entry(counter, &ctx->counter_list, list_entry) + group_sched_out(counter, cpuctx, ctx); } spin_unlock(&ctx->lock); cpuctx->task_ctx = NULL; } +static void +counter_sched_in(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, + int cpu) +{ + if (!counter->active) + return; + + hw_perf_counter_enable(counter); + counter->active = 1; + counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + + cpuctx->active_oncpu++; + ctx->nr_active++; +} + +static void +group_sched_in(struct perf_counter *group_counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, + int cpu) +{ + struct perf_counter *counter; + + counter_sched_in(group_counter, cpuctx, ctx, cpu); + + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) + counter_sched_in(counter, cpuctx, ctx, cpu); +} + /* * Called from scheduler to add the counters of the current task * with interrupts disabled. @@ -342,19 +444,21 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) return; spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->counters, list) { + list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (ctx->nr_active == cpuctx->max_pertask) break; + + /* + * Listen to the 'cpu' scheduling filter constraint + * of counters: + */ if (counter->cpu != -1 && counter->cpu != cpu) continue; - hw_perf_counter_enable(counter); - counter->active = 1; - counter->oncpu = cpu; - ctx->nr_active++; - cpuctx->active_oncpu++; + group_sched_in(counter, cpuctx, ctx, cpu); } spin_unlock(&ctx->lock); + cpuctx->task_ctx = ctx; } @@ -371,12 +475,12 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) spin_lock(&ctx->lock); /* - * Rotate the first entry last: + * Rotate the first entry last (works just fine for group counters too): */ hw_perf_disable_all(); - list_for_each_entry(counter, &ctx->counters, list) { - list_del(&counter->list); - list_add_tail(&counter->list, &ctx->counters); + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + list_del(&counter->list_entry); + list_add_tail(&counter->list_entry, &ctx->counter_list); break; } hw_perf_enable_all(); @@ -386,17 +490,24 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) perf_counter_task_sched_in(curr, cpu); } +/* + * Initialize the perf_counter context in a task_struct: + */ +static void +__perf_counter_init_context(struct perf_counter_context *ctx, + struct task_struct *task) +{ + spin_lock_init(&ctx->lock); + INIT_LIST_HEAD(&ctx->counter_list); + ctx->nr_counters = 0; + ctx->task = task; +} /* * Initialize the perf_counter context in task_struct */ void perf_counter_init_task(struct task_struct *task) { - struct perf_counter_context *ctx = &task->perf_counter_ctx; - - spin_lock_init(&ctx->lock); - INIT_LIST_HEAD(&ctx->counters); - ctx->nr_counters = 0; - ctx->task = task; + __perf_counter_init_context(&task->perf_counter_ctx, task); } /* @@ -407,7 +518,7 @@ static void __hw_perf_counter_read(void *info) hw_perf_counter_read(info); } -static u64 perf_read_counter(struct perf_counter *counter) +static u64 perf_counter_read(struct perf_counter *counter) { /* * If counter is enabled and currently active on a CPU, update the @@ -418,7 +529,7 @@ static u64 perf_read_counter(struct perf_counter *counter) __hw_perf_counter_read, counter, 1); } - return perf_read_counter_safe(counter); + return perf_counter_read_safe(counter); } /* @@ -555,7 +666,7 @@ static int perf_release(struct inode *inode, struct file *file) mutex_lock(&counter->mutex); - perf_remove_from_context(counter); + perf_counter_remove_from_context(counter); put_context(ctx); mutex_unlock(&counter->mutex); @@ -577,7 +688,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) return -EINVAL; mutex_lock(&counter->mutex); - cntval = perf_read_counter(counter); + cntval = perf_counter_read(counter); mutex_unlock(&counter->mutex); return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval); @@ -707,15 +818,25 @@ static const struct file_operations perf_fops = { * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu) +perf_counter_alloc(struct perf_counter_hw_event *hw_event, + int cpu, + struct perf_counter *group_leader) { struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); if (!counter) return NULL; + /* + * Single counters are their own group leaders, with an + * empty sibling list: + */ + if (!group_leader) + group_leader = counter; + mutex_init(&counter->mutex); - INIT_LIST_HEAD(&counter->list); + INIT_LIST_HEAD(&counter->list_entry); + INIT_LIST_HEAD(&counter->sibling_list); init_waitqueue_head(&counter->waitq); counter->irqdata = &counter->data[0]; @@ -723,6 +844,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu) counter->cpu = cpu; counter->hw_event = *hw_event; counter->wakeup_pending = 0; + counter->group_leader = group_leader; return counter; } @@ -743,20 +865,45 @@ asmlinkage int sys_perf_counter_open( int group_fd) { - struct perf_counter_context *ctx; + struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; - struct perf_counter *counter; + struct perf_counter_context *ctx; + struct file *group_file = NULL; + int fput_needed = 0; int ret; if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) return -EFAULT; + /* + * Look up the group leader: + */ + group_leader = NULL; + if (group_fd != -1) { + ret = -EINVAL; + group_file = fget_light(group_fd, &fput_needed); + if (!group_file) + goto out_fput; + if (group_file->f_op != &perf_fops) + goto out_fput; + + group_leader = group_file->private_data; + /* + * Do not allow a recursive hierarchy: + */ + if (group_leader->group_leader) + goto out_fput; + } + + /* + * Get the target context (task or percpu): + */ ctx = find_get_context(pid, cpu); if (IS_ERR(ctx)) return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(&hw_event, cpu); + counter = perf_counter_alloc(&hw_event, cpu, group_leader); if (!counter) goto err_put_context; @@ -770,11 +917,14 @@ asmlinkage int sys_perf_counter_open( if (ret < 0) goto err_remove_free_put_context; +out_fput: + fput_light(group_file, fput_needed); + return ret; err_remove_free_put_context: mutex_lock(&counter->mutex); - perf_remove_from_context(counter); + perf_counter_remove_from_context(counter); mutex_unlock(&counter->mutex); err_free_put_context: @@ -783,40 +933,40 @@ err_free_put_context: err_put_context: put_context(ctx); - return ret; + goto out_fput; } -static void __cpuinit perf_init_cpu(int cpu) +static void __cpuinit perf_counter_init_cpu(int cpu) { - struct perf_cpu_context *ctx; + struct perf_cpu_context *cpuctx; - ctx = &per_cpu(perf_cpu_context, cpu); - spin_lock_init(&ctx->ctx.lock); - INIT_LIST_HEAD(&ctx->ctx.counters); + cpuctx = &per_cpu(perf_cpu_context, cpu); + __perf_counter_init_context(&cpuctx->ctx, NULL); mutex_lock(&perf_resource_mutex); - ctx->max_pertask = perf_max_counters - perf_reserved_percpu; + cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; mutex_unlock(&perf_resource_mutex); + hw_perf_counter_setup(); } #ifdef CONFIG_HOTPLUG_CPU -static void __perf_exit_cpu(void *info) +static void __perf_counter_exit_cpu(void *info) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter_context *ctx = &cpuctx->ctx; struct perf_counter *counter, *tmp; - list_for_each_entry_safe(counter, tmp, &ctx->counters, list) - __perf_remove_from_context(counter); + list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) + __perf_counter_remove_from_context(counter); } -static void perf_exit_cpu(int cpu) +static void perf_counter_exit_cpu(int cpu) { - smp_call_function_single(cpu, __perf_exit_cpu, NULL, 1); + smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); } #else -static inline void perf_exit_cpu(int cpu) { } +static inline void perf_counter_exit_cpu(int cpu) { } #endif static int __cpuinit @@ -828,12 +978,12 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - perf_init_cpu(cpu); + perf_counter_init_cpu(cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - perf_exit_cpu(cpu); + perf_counter_exit_cpu(cpu); break; default: -- cgit v1.2.3 From ccff286d85098ba5438e22aa2ea807fc1e18cf2f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 11:26:29 +0100 Subject: perf counters: group counter, fixes Impact: bugfix Check that a group does not span outside the context of a CPU or a task. Also, do not allow deep recursive hierarchies. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index fa59fe8c02d..278209c547a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -107,9 +107,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_del_init(&counter->list_entry); - if (list_empty(&counter->sibling_list)) - return; - /* * If this was a group counter with sibling counters then * upgrade the siblings to singleton counters by adding them @@ -395,9 +392,6 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { - if (!counter->active) - return; - hw_perf_counter_enable(counter); counter->active = 1; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ @@ -876,32 +870,39 @@ asmlinkage int sys_perf_counter_open( return -EFAULT; /* - * Look up the group leader: + * Get the target context (task or percpu): + */ + ctx = find_get_context(pid, cpu); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + /* + * Look up the group leader (we will attach this counter to it): */ group_leader = NULL; if (group_fd != -1) { ret = -EINVAL; group_file = fget_light(group_fd, &fput_needed); if (!group_file) - goto out_fput; + goto err_put_context; if (group_file->f_op != &perf_fops) - goto out_fput; + goto err_put_context; group_leader = group_file->private_data; /* - * Do not allow a recursive hierarchy: + * Do not allow a recursive hierarchy (this new sibling + * becoming part of another group-sibling): + */ + if (group_leader->group_leader != group_leader) + goto err_put_context; + /* + * Do not allow to attach to a group in a different + * task or CPU context: */ - if (group_leader->group_leader) - goto out_fput; + if (group_leader->ctx != ctx) + goto err_put_context; } - /* - * Get the target context (task or percpu): - */ - ctx = find_get_context(pid, cpu); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - ret = -ENOMEM; counter = perf_counter_alloc(&hw_event, cpu, group_leader); if (!counter) -- cgit v1.2.3 From 621a01eac89b5e2f81a4cf576568b31f40a02724 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 12:46:46 +0100 Subject: perf counters: hw driver API Impact: restructure code, introduce hw_ops driver abstraction Introduce this abstraction to handle counter details: struct hw_perf_counter_ops { void (*hw_perf_counter_enable) (struct perf_counter *counter); void (*hw_perf_counter_disable) (struct perf_counter *counter); void (*hw_perf_counter_read) (struct perf_counter *counter); }; This will be useful to support assymetric hw details, and it will also be useful to implement "software counters". (Counters that count kernel managed sw events such as pagefaults, context-switches, wall-clock time or task-local time.) Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 278209c547a..e6e41ca9546 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -37,18 +37,15 @@ static DEFINE_MUTEX(perf_resource_mutex); /* * Architecture provided APIs - weak aliases: */ - -int __weak hw_perf_counter_init(struct perf_counter *counter) +extern __weak struct hw_perf_counter_ops * +hw_perf_counter_init(struct perf_counter *counter) { - return -EINVAL; + return ERR_PTR(-EINVAL); } -void __weak hw_perf_counter_enable(struct perf_counter *counter) { } -void __weak hw_perf_counter_disable(struct perf_counter *counter) { } -void __weak hw_perf_counter_read(struct perf_counter *counter) { } -void __weak hw_perf_disable_all(void) { } -void __weak hw_perf_enable_all(void) { } -void __weak hw_perf_counter_setup(void) { } +void __weak hw_perf_disable_all(void) { } +void __weak hw_perf_enable_all(void) { } +void __weak hw_perf_counter_setup(void) { } #if BITS_PER_LONG == 64 @@ -146,7 +143,7 @@ static void __perf_counter_remove_from_context(void *info) spin_lock(&ctx->lock); if (counter->active) { - hw_perf_counter_disable(counter); + counter->hw_ops->hw_perf_counter_disable(counter); counter->active = 0; ctx->nr_active--; cpuctx->active_oncpu--; @@ -257,7 +254,7 @@ static void __perf_install_in_context(void *info) ctx->nr_counters++; if (cpuctx->active_oncpu < perf_max_counters) { - hw_perf_counter_enable(counter); + counter->hw_ops->hw_perf_counter_enable(counter); counter->active = 1; counter->oncpu = cpu; ctx->nr_active++; @@ -333,7 +330,7 @@ counter_sched_out(struct perf_counter *counter, if (!counter->active) return; - hw_perf_counter_disable(counter); + counter->hw_ops->hw_perf_counter_disable(counter); counter->active = 0; counter->oncpu = -1; @@ -392,7 +389,7 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { - hw_perf_counter_enable(counter); + counter->hw_ops->hw_perf_counter_enable(counter); counter->active = 1; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ @@ -509,7 +506,9 @@ void perf_counter_init_task(struct task_struct *task) */ static void __hw_perf_counter_read(void *info) { - hw_perf_counter_read(info); + struct perf_counter *counter = info; + + counter->hw_ops->hw_perf_counter_read(counter); } static u64 perf_counter_read(struct perf_counter *counter) @@ -816,8 +815,10 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu, struct perf_counter *group_leader) { - struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); + struct hw_perf_counter_ops *hw_ops; + struct perf_counter *counter; + counter = kzalloc(sizeof(*counter), GFP_KERNEL); if (!counter) return NULL; @@ -839,6 +840,14 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->hw_event = *hw_event; counter->wakeup_pending = 0; counter->group_leader = group_leader; + counter->hw_ops = NULL; + + hw_ops = hw_perf_counter_init(counter); + if (!hw_ops) { + kfree(counter); + return NULL; + } + counter->hw_ops = hw_ops; return counter; } @@ -908,10 +917,6 @@ asmlinkage int sys_perf_counter_open( if (!counter) goto err_put_context; - ret = hw_perf_counter_init(counter); - if (ret) - goto err_free_put_context; - perf_install_in_context(ctx, counter, cpu); ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); @@ -927,8 +932,6 @@ err_remove_free_put_context: mutex_lock(&counter->mutex); perf_counter_remove_from_context(counter); mutex_unlock(&counter->mutex); - -err_free_put_context: kfree(counter); err_put_context: -- cgit v1.2.3 From 5c92d12411dfe5f0f3d1b1c1e2f756245e6f7249 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 13:21:10 +0100 Subject: perf counters: implement PERF_COUNT_CPU_CLOCK Impact: add new perf-counter type The 'CPU clock' counter counts the amount of CPU clock time that is elapsing, in nanoseconds. (regardless of how much of it the task is spending on a CPU executing) This counter type is a Linux kernel based abstraction, it is available even if the hardware does not support native hardware performance counters. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 95 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 82 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e6e41ca9546..506286e5ba6 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -37,15 +37,15 @@ static DEFINE_MUTEX(perf_resource_mutex); /* * Architecture provided APIs - weak aliases: */ -extern __weak struct hw_perf_counter_ops * +extern __weak const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter) { return ERR_PTR(-EINVAL); } -void __weak hw_perf_disable_all(void) { } -void __weak hw_perf_enable_all(void) { } -void __weak hw_perf_counter_setup(void) { } +u64 __weak hw_perf_disable_all(void) { return 0; } +void __weak hw_perf_restore_ctrl(u64 ctrl) { } +void __weak hw_perf_counter_setup(void) { } #if BITS_PER_LONG == 64 @@ -58,6 +58,16 @@ static inline u64 perf_counter_read_safe(struct perf_counter *counter) return (u64) atomic64_read(&counter->count); } +void atomic64_counter_set(struct perf_counter *counter, u64 val) +{ + atomic64_set(&counter->count, val); +} + +u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic64_read(&counter->count); +} + #else /* @@ -79,6 +89,20 @@ static u64 perf_counter_read_safe(struct perf_counter *counter) return cntl | ((u64) cnth) << 32; } +void atomic64_counter_set(struct perf_counter *counter, u64 val64) +{ + u32 *val32 = (void *)&val64; + + atomic_set(counter->count32 + 0, *(val32 + 0)); + atomic_set(counter->count32 + 1, *(val32 + 1)); +} + +u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic_read(counter->count32 + 0) | + (u64) atomic_read(counter->count32 + 1) << 32; +} + #endif static void @@ -131,6 +155,7 @@ static void __perf_counter_remove_from_context(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; + u64 perf_flags; /* * If this is a task context, we need to check whether it is @@ -155,9 +180,9 @@ static void __perf_counter_remove_from_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - hw_perf_disable_all(); + perf_flags = hw_perf_disable_all(); list_del_counter(counter, ctx); - hw_perf_enable_all(); + hw_perf_restore_ctrl(perf_flags); if (!ctx->task) { /* @@ -232,6 +257,7 @@ static void __perf_install_in_context(void *info) struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; int cpu = smp_processor_id(); + u64 perf_flags; /* * If this is a task context, we need to check whether it is @@ -247,9 +273,9 @@ static void __perf_install_in_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - hw_perf_disable_all(); + perf_flags = hw_perf_disable_all(); list_add_counter(counter, ctx); - hw_perf_enable_all(); + hw_perf_restore_ctrl(perf_flags); ctx->nr_counters++; @@ -457,6 +483,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) { struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; + u64 perf_flags; if (likely(!ctx->nr_counters)) return; @@ -468,13 +495,13 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) /* * Rotate the first entry last (works just fine for group counters too): */ - hw_perf_disable_all(); + perf_flags = hw_perf_disable_all(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { list_del(&counter->list_entry); list_add_tail(&counter->list_entry, &ctx->counter_list); break; } - hw_perf_enable_all(); + hw_perf_restore_ctrl(perf_flags); spin_unlock(&ctx->lock); @@ -807,6 +834,42 @@ static const struct file_operations perf_fops = { .poll = perf_poll, }; +static void cpu_clock_perf_counter_enable(struct perf_counter *counter) +{ +} + +static void cpu_clock_perf_counter_disable(struct perf_counter *counter) +{ +} + +static void cpu_clock_perf_counter_read(struct perf_counter *counter) +{ + int cpu = raw_smp_processor_id(); + + atomic64_counter_set(counter, cpu_clock(cpu)); +} + +static const struct hw_perf_counter_ops perf_ops_cpu_clock = { + .hw_perf_counter_enable = cpu_clock_perf_counter_enable, + .hw_perf_counter_disable = cpu_clock_perf_counter_disable, + .hw_perf_counter_read = cpu_clock_perf_counter_read, +}; + +static const struct hw_perf_counter_ops * +sw_perf_counter_init(struct perf_counter *counter) +{ + const struct hw_perf_counter_ops *hw_ops = NULL; + + switch (counter->hw_event.type) { + case PERF_COUNT_CPU_CLOCK: + hw_ops = &perf_ops_cpu_clock; + break; + default: + break; + } + return hw_ops; +} + /* * Allocate and initialize a counter structure */ @@ -815,7 +878,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu, struct perf_counter *group_leader) { - struct hw_perf_counter_ops *hw_ops; + const struct hw_perf_counter_ops *hw_ops; struct perf_counter *counter; counter = kzalloc(sizeof(*counter), GFP_KERNEL); @@ -842,7 +905,13 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->group_leader = group_leader; counter->hw_ops = NULL; - hw_ops = hw_perf_counter_init(counter); + hw_ops = NULL; + if (!hw_event->raw && hw_event->type < 0) + hw_ops = sw_perf_counter_init(counter); + if (!hw_ops) { + hw_ops = hw_perf_counter_init(counter); + } + if (!hw_ops) { kfree(counter); return NULL; @@ -912,7 +981,7 @@ asmlinkage int sys_perf_counter_open( goto err_put_context; } - ret = -ENOMEM; + ret = -EINVAL; counter = perf_counter_alloc(&hw_event, cpu, group_leader); if (!counter) goto err_put_context; -- cgit v1.2.3 From 01b2838c4298c5e0d30b4993c195ac34dd9df61e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 13:45:51 +0100 Subject: perf counters: consolidate hw_perf save/restore APIs Impact: cleanup Rename them to better match up the usual IRQ disable/enable APIs: hw_perf_disable_all() => hw_perf_save_disable() hw_perf_restore_ctrl() => hw_perf_restore() Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 506286e5ba6..0e93fea1712 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -43,8 +43,8 @@ hw_perf_counter_init(struct perf_counter *counter) return ERR_PTR(-EINVAL); } -u64 __weak hw_perf_disable_all(void) { return 0; } -void __weak hw_perf_restore_ctrl(u64 ctrl) { } +u64 __weak hw_perf_save_disable(void) { return 0; } +void __weak hw_perf_restore(u64 ctrl) { } void __weak hw_perf_counter_setup(void) { } #if BITS_PER_LONG == 64 @@ -180,9 +180,9 @@ static void __perf_counter_remove_from_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - perf_flags = hw_perf_disable_all(); + perf_flags = hw_perf_save_disable(); list_del_counter(counter, ctx); - hw_perf_restore_ctrl(perf_flags); + hw_perf_restore(perf_flags); if (!ctx->task) { /* @@ -273,9 +273,9 @@ static void __perf_install_in_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - perf_flags = hw_perf_disable_all(); + perf_flags = hw_perf_save_disable(); list_add_counter(counter, ctx); - hw_perf_restore_ctrl(perf_flags); + hw_perf_restore(perf_flags); ctx->nr_counters++; @@ -495,13 +495,13 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) /* * Rotate the first entry last (works just fine for group counters too): */ - perf_flags = hw_perf_disable_all(); + perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { list_del(&counter->list_entry); list_add_tail(&counter->list_entry, &ctx->counter_list); break; } - hw_perf_restore_ctrl(perf_flags); + hw_perf_restore(perf_flags); spin_unlock(&ctx->lock); -- cgit v1.2.3 From bae43c9945ebeef15e7952e317efb02393d3bfc7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 14:03:20 +0100 Subject: perf counters: implement PERF_COUNT_TASK_CLOCK Impact: add new perf-counter type The 'task clock' counter counts the amount of time a task is executing, in nanoseconds. It stops ticking when a task is scheduled out either due to it blocking, sleeping or it being preempted. This counter type is a Linux kernel based abstraction, it is available even if the hardware does not support native hardware performance counters. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0e93fea1712..a0fe8474ee2 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -855,6 +855,25 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { .hw_perf_counter_read = cpu_clock_perf_counter_read, }; +static void task_clock_perf_counter_enable(struct perf_counter *counter) +{ +} + +static void task_clock_perf_counter_disable(struct perf_counter *counter) +{ +} + +static void task_clock_perf_counter_read(struct perf_counter *counter) +{ + atomic64_counter_set(counter, current->se.sum_exec_runtime); +} + +static const struct hw_perf_counter_ops perf_ops_task_clock = { + .hw_perf_counter_enable = task_clock_perf_counter_enable, + .hw_perf_counter_disable = task_clock_perf_counter_disable, + .hw_perf_counter_read = task_clock_perf_counter_read, +}; + static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { @@ -864,6 +883,9 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_CPU_CLOCK: hw_ops = &perf_ops_cpu_clock; break; + case PERF_COUNT_TASK_CLOCK: + hw_ops = &perf_ops_task_clock; + break; default: break; } -- cgit v1.2.3 From 1d1c7ddbfab358445a542715551301b7fc363e28 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 14:59:31 +0100 Subject: perf counters: add prctl interface to disable/enable counters Add a way for self-monitoring tasks to disable/enable counters summarily, via a prctl: PR_TASK_PERF_COUNTERS_DISABLE 31 PR_TASK_PERF_COUNTERS_ENABLE 32 Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++----- kernel/sys.c | 7 +++++ 2 files changed, 86 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index a0fe8474ee2..4e679b91d8b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -415,6 +415,9 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { + if (counter->active == -1) + return; + counter->hw_ops->hw_perf_counter_enable(counter); counter->active = 1; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ @@ -479,6 +482,79 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) cpuctx->task_ctx = ctx; } +int perf_counter_task_disable(void) +{ + struct task_struct *curr = current; + struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter *counter; + u64 perf_flags; + int cpu; + + if (likely(!ctx->nr_counters)) + return 0; + + local_irq_disable(); + cpu = smp_processor_id(); + + perf_counter_task_sched_out(curr, cpu); + + spin_lock(&ctx->lock); + + /* + * Disable all the counters: + */ + perf_flags = hw_perf_save_disable(); + + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + WARN_ON_ONCE(counter->active == 1); + counter->active = -1; + } + hw_perf_restore(perf_flags); + + spin_unlock(&ctx->lock); + + local_irq_enable(); + + return 0; +} + +int perf_counter_task_enable(void) +{ + struct task_struct *curr = current; + struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter *counter; + u64 perf_flags; + int cpu; + + if (likely(!ctx->nr_counters)) + return 0; + + local_irq_disable(); + cpu = smp_processor_id(); + + spin_lock(&ctx->lock); + + /* + * Disable all the counters: + */ + perf_flags = hw_perf_save_disable(); + + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->active != -1) + continue; + counter->active = 0; + } + hw_perf_restore(perf_flags); + + spin_unlock(&ctx->lock); + + perf_counter_task_sched_in(curr, cpu); + + local_irq_enable(); + + return 0; +} + void perf_counter_task_tick(struct task_struct *curr, int cpu) { struct perf_counter_context *ctx = &curr->perf_counter_ctx; @@ -951,13 +1027,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, * @cpu: target cpu * @group_fd: group leader counter fd */ -asmlinkage int sys_perf_counter_open( - - struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, - int cpu, - int group_fd) - +asmlinkage int +sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, int cpu, int group_fd) { struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; diff --git a/kernel/sys.c b/kernel/sys.c index 31deba8f7d1..0f66633be31 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -1716,6 +1717,12 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, case PR_SET_TSC: error = SET_TSC_CTL(arg2); break; + case PR_TASK_PERF_COUNTERS_DISABLE: + error = perf_counter_task_disable(); + break; + case PR_TASK_PERF_COUNTERS_ENABLE: + error = perf_counter_task_enable(); + break; case PR_GET_TIMERSLACK: error = current->timer_slack_ns; break; -- cgit v1.2.3 From 6a930700c8b655a9e25e42fc4adc0b225ebbcefc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 15:17:03 +0100 Subject: perf counters: clean up state transitions Impact: cleanup Introduce a proper enum for the 3 states of a counter: PERF_COUNTER_STATE_OFF = -1 PERF_COUNTER_STATE_INACTIVE = 0 PERF_COUNTER_STATE_ACTIVE = 1 and rename counter->active to counter->state and propagate the changes everywhere. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4e679b91d8b..559130b8774 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -167,9 +167,9 @@ static void __perf_counter_remove_from_context(void *info) spin_lock(&ctx->lock); - if (counter->active) { + if (counter->state == PERF_COUNTER_STATE_ACTIVE) { counter->hw_ops->hw_perf_counter_disable(counter); - counter->active = 0; + counter->state = PERF_COUNTER_STATE_INACTIVE; ctx->nr_active--; cpuctx->active_oncpu--; counter->task = NULL; @@ -281,7 +281,7 @@ static void __perf_install_in_context(void *info) if (cpuctx->active_oncpu < perf_max_counters) { counter->hw_ops->hw_perf_counter_enable(counter); - counter->active = 1; + counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; ctx->nr_active++; cpuctx->active_oncpu++; @@ -328,7 +328,6 @@ retry: spin_lock_irq(&ctx->lock); /* - * If the context is active and the counter has not been added * we need to retry the smp call. */ if (ctx->nr_active && list_empty(&counter->list_entry)) { @@ -353,12 +352,12 @@ counter_sched_out(struct perf_counter *counter, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx) { - if (!counter->active) + if (counter->state != PERF_COUNTER_STATE_ACTIVE) return; counter->hw_ops->hw_perf_counter_disable(counter); - counter->active = 0; - counter->oncpu = -1; + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->oncpu = -1; cpuctx->active_oncpu--; ctx->nr_active--; @@ -415,11 +414,11 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { - if (counter->active == -1) + if (counter->state == PERF_COUNTER_STATE_OFF) return; counter->hw_ops->hw_perf_counter_enable(counter); - counter->active = 1; + counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ cpuctx->active_oncpu++; @@ -506,8 +505,8 @@ int perf_counter_task_disable(void) perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - WARN_ON_ONCE(counter->active == 1); - counter->active = -1; + WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE); + counter->state = PERF_COUNTER_STATE_OFF; } hw_perf_restore(perf_flags); @@ -540,9 +539,9 @@ int perf_counter_task_enable(void) perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->active != -1) + if (counter->state != PERF_COUNTER_STATE_OFF) continue; - counter->active = 0; + counter->state = PERF_COUNTER_STATE_INACTIVE; } hw_perf_restore(perf_flags); @@ -620,7 +619,7 @@ static u64 perf_counter_read(struct perf_counter *counter) * If counter is enabled and currently active on a CPU, update the * value in the counter structure: */ - if (counter->active) { + if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, __hw_perf_counter_read, counter, 1); } @@ -673,7 +672,7 @@ static struct perf_data *perf_switch_irq_data(struct perf_counter *counter) retry: spin_lock_irq(&ctx->lock); - if (!counter->active) { + if (counter->state != PERF_COUNTER_STATE_ACTIVE) { counter->irqdata = counter->usrdata; counter->usrdata = oldirqdata; spin_unlock_irq(&ctx->lock); -- cgit v1.2.3 From ee06094f8279e1312fc0a31591320cc7b6f0ab1e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 13 Dec 2008 09:00:03 +0100 Subject: perfcounters: restructure x86 counter math Impact: restructure code Change counter math from absolute values to clear delta logic. We try to extract elapsed deltas from the raw hw counter - and put that into the generic counter. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 68 ++++----------------------------------------------- 1 file changed, 5 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 559130b8774..416861ce8b2 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -44,67 +44,9 @@ hw_perf_counter_init(struct perf_counter *counter) } u64 __weak hw_perf_save_disable(void) { return 0; } -void __weak hw_perf_restore(u64 ctrl) { } +void __weak hw_perf_restore(u64 ctrl) { } void __weak hw_perf_counter_setup(void) { } -#if BITS_PER_LONG == 64 - -/* - * Read the cached counter in counter safe against cross CPU / NMI - * modifications. 64 bit version - no complications. - */ -static inline u64 perf_counter_read_safe(struct perf_counter *counter) -{ - return (u64) atomic64_read(&counter->count); -} - -void atomic64_counter_set(struct perf_counter *counter, u64 val) -{ - atomic64_set(&counter->count, val); -} - -u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic64_read(&counter->count); -} - -#else - -/* - * Read the cached counter in counter safe against cross CPU / NMI - * modifications. 32 bit version. - */ -static u64 perf_counter_read_safe(struct perf_counter *counter) -{ - u32 cntl, cnth; - - local_irq_disable(); - do { - cnth = atomic_read(&counter->count32[1]); - cntl = atomic_read(&counter->count32[0]); - } while (cnth != atomic_read(&counter->count32[1])); - - local_irq_enable(); - - return cntl | ((u64) cnth) << 32; -} - -void atomic64_counter_set(struct perf_counter *counter, u64 val64) -{ - u32 *val32 = (void *)&val64; - - atomic_set(counter->count32 + 0, *(val32 + 0)); - atomic_set(counter->count32 + 1, *(val32 + 1)); -} - -u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic_read(counter->count32 + 0) | - (u64) atomic_read(counter->count32 + 1) << 32; -} - -#endif - static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -280,11 +222,11 @@ static void __perf_install_in_context(void *info) ctx->nr_counters++; if (cpuctx->active_oncpu < perf_max_counters) { - counter->hw_ops->hw_perf_counter_enable(counter); counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; ctx->nr_active++; cpuctx->active_oncpu++; + counter->hw_ops->hw_perf_counter_enable(counter); } if (!ctx->task && cpuctx->max_pertask) @@ -624,7 +566,7 @@ static u64 perf_counter_read(struct perf_counter *counter) __hw_perf_counter_read, counter, 1); } - return perf_counter_read_safe(counter); + return atomic64_read(&counter->count); } /* @@ -921,7 +863,7 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter) { int cpu = raw_smp_processor_id(); - atomic64_counter_set(counter, cpu_clock(cpu)); + atomic64_set(&counter->count, cpu_clock(cpu)); } static const struct hw_perf_counter_ops perf_ops_cpu_clock = { @@ -940,7 +882,7 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter) static void task_clock_perf_counter_read(struct perf_counter *counter) { - atomic64_counter_set(counter, current->se.sum_exec_runtime); + atomic64_set(&counter->count, current->se.sum_exec_runtime); } static const struct hw_perf_counter_ops perf_ops_task_clock = { -- cgit v1.2.3 From 9b51f66dcb09ac5eb6bc68fc111d5c7a1e0131d6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Dec 2008 13:49:45 +0100 Subject: perfcounters: implement "counter inheritance" Impact: implement new performance feature Counter inheritance can be used to run performance counters in a workload, transparently - and pipe back the counter results to the parent counter. Inheritance for performance counters works the following way: when creating a counter it can be marked with the .inherit=1 flag. Such counters are then 'inherited' by all child tasks (be they fork()-ed or clone()-ed). These counters get inherited through exec() boundaries as well (except through setuid boundaries). The counter values get added back to the parent counter(s) when the child task(s) exit - much like stime/utime statistics are gathered. So inherited counters are ideal to gather summary statistics about an application's behavior via shell commands, without having to modify that application. The timec.c command utilizes counter inheritance: http://redhat.com/~mingo/perfcounters/timec.c Sample output: $ ./timec -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null Performance counter stats for 'ls': 163516953 instructions 2295 cache-misses 2855182 branch-misses Signed-off-by: Ingo Molnar --- kernel/exit.c | 7 +- kernel/perf_counter.c | 248 +++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 210 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 2d8be7ebb0f..d336c90a5f1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1093,11 +1093,12 @@ NORET_TYPE void do_exit(long code) mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif -#ifdef CONFIG_FUTEX /* - * This must happen late, after the PID is not - * hashed anymore: + * These must happen late, after the PID is not + * hashed anymore, but still at a point that may sleep: */ + perf_counter_exit_task(tsk); +#ifdef CONFIG_FUTEX if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); if (unlikely(current->pi_state_cache)) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 416861ce8b2..f5e81dd193d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -80,8 +80,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_del_init(&sibling->list_entry); list_add_tail(&sibling->list_entry, &ctx->counter_list); - WARN_ON_ONCE(!sibling->group_leader); - WARN_ON_ONCE(sibling->group_leader == sibling); sibling->group_leader = sibling; } } @@ -97,6 +95,7 @@ static void __perf_counter_remove_from_context(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; + unsigned long flags; u64 perf_flags; /* @@ -107,7 +106,7 @@ static void __perf_counter_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); if (counter->state == PERF_COUNTER_STATE_ACTIVE) { counter->hw_ops->hw_perf_counter_disable(counter); @@ -136,7 +135,7 @@ static void __perf_counter_remove_from_context(void *info) perf_max_counters - perf_reserved_percpu); } - spin_unlock(&ctx->lock); + spin_unlock_irqrestore(&ctx->lock, flags); } @@ -199,6 +198,7 @@ static void __perf_install_in_context(void *info) struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; int cpu = smp_processor_id(); + unsigned long flags; u64 perf_flags; /* @@ -209,7 +209,7 @@ static void __perf_install_in_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); /* * Protect the list operation against NMI by disabling the @@ -232,7 +232,7 @@ static void __perf_install_in_context(void *info) if (!ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; - spin_unlock(&ctx->lock); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -446,10 +446,9 @@ int perf_counter_task_disable(void) */ perf_flags = hw_perf_save_disable(); - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE); + list_for_each_entry(counter, &ctx->counter_list, list_entry) counter->state = PERF_COUNTER_STATE_OFF; - } + hw_perf_restore(perf_flags); spin_unlock(&ctx->lock); @@ -525,26 +524,6 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) perf_counter_task_sched_in(curr, cpu); } -/* - * Initialize the perf_counter context in a task_struct: - */ -static void -__perf_counter_init_context(struct perf_counter_context *ctx, - struct task_struct *task) -{ - spin_lock_init(&ctx->lock); - INIT_LIST_HEAD(&ctx->counter_list); - ctx->nr_counters = 0; - ctx->task = task; -} -/* - * Initialize the perf_counter context in task_struct - */ -void perf_counter_init_task(struct task_struct *task) -{ - __perf_counter_init_context(&task->perf_counter_ctx, task); -} - /* * Cross CPU call to read the hardware counter */ @@ -663,7 +642,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = &cpuctx->ctx; - WARN_ON_ONCE(ctx->task); return ctx; } @@ -915,12 +893,13 @@ sw_perf_counter_init(struct perf_counter *counter) static struct perf_counter * perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu, - struct perf_counter *group_leader) + struct perf_counter *group_leader, + gfp_t gfpflags) { const struct hw_perf_counter_ops *hw_ops; struct perf_counter *counter; - counter = kzalloc(sizeof(*counter), GFP_KERNEL); + counter = kzalloc(sizeof(*counter), gfpflags); if (!counter) return NULL; @@ -947,9 +926,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, hw_ops = NULL; if (!hw_event->raw && hw_event->type < 0) hw_ops = sw_perf_counter_init(counter); - if (!hw_ops) { + if (!hw_ops) hw_ops = hw_perf_counter_init(counter); - } if (!hw_ops) { kfree(counter); @@ -975,8 +953,10 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; struct perf_counter_context *ctx; + struct file *counter_file = NULL; struct file *group_file = NULL; int fput_needed = 0; + int fput_needed2 = 0; int ret; if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) @@ -1017,25 +997,29 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, } ret = -EINVAL; - counter = perf_counter_alloc(&hw_event, cpu, group_leader); + counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL); if (!counter) goto err_put_context; - perf_install_in_context(ctx, counter, cpu); - ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); if (ret < 0) - goto err_remove_free_put_context; + goto err_free_put_context; + + counter_file = fget_light(ret, &fput_needed2); + if (!counter_file) + goto err_free_put_context; + + counter->filp = counter_file; + perf_install_in_context(ctx, counter, cpu); + + fput_light(counter_file, fput_needed2); out_fput: fput_light(group_file, fput_needed); return ret; -err_remove_free_put_context: - mutex_lock(&counter->mutex); - perf_counter_remove_from_context(counter); - mutex_unlock(&counter->mutex); +err_free_put_context: kfree(counter); err_put_context: @@ -1044,6 +1028,186 @@ err_put_context: goto out_fput; } +/* + * Initialize the perf_counter context in a task_struct: + */ +static void +__perf_counter_init_context(struct perf_counter_context *ctx, + struct task_struct *task) +{ + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->lock); + INIT_LIST_HEAD(&ctx->counter_list); + ctx->task = task; +} + +/* + * inherit a counter from parent task to child task: + */ +static int +inherit_counter(struct perf_counter *parent_counter, + struct task_struct *parent, + struct perf_counter_context *parent_ctx, + struct task_struct *child, + struct perf_counter_context *child_ctx) +{ + struct perf_counter *child_counter; + + child_counter = perf_counter_alloc(&parent_counter->hw_event, + parent_counter->cpu, NULL, + GFP_ATOMIC); + if (!child_counter) + return -ENOMEM; + + /* + * Link it up in the child's context: + */ + child_counter->ctx = child_ctx; + child_counter->task = child; + list_add_counter(child_counter, child_ctx); + child_ctx->nr_counters++; + + child_counter->parent = parent_counter; + parent_counter->nr_inherited++; + /* + * inherit into child's child as well: + */ + child_counter->hw_event.inherit = 1; + + /* + * Get a reference to the parent filp - we will fput it + * when the child counter exits. This is safe to do because + * we are in the parent and we know that the filp still + * exists and has a nonzero count: + */ + atomic_long_inc(&parent_counter->filp->f_count); + + return 0; +} + +static void +__perf_counter_exit_task(struct task_struct *child, + struct perf_counter *child_counter, + struct perf_counter_context *child_ctx) +{ + struct perf_counter *parent_counter; + u64 parent_val, child_val; + u64 perf_flags; + + /* + * Disable and unlink this counter. + * + * Be careful about zapping the list - IRQ/NMI context + * could still be processing it: + */ + local_irq_disable(); + perf_flags = hw_perf_save_disable(); + + if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) + child_counter->hw_ops->hw_perf_counter_disable(child_counter); + list_del_init(&child_counter->list_entry); + + hw_perf_restore(perf_flags); + local_irq_enable(); + + parent_counter = child_counter->parent; + /* + * It can happen that parent exits first, and has counters + * that are still around due to the child reference. These + * counters need to be zapped - but otherwise linger. + */ + if (!parent_counter) + return; + + parent_val = atomic64_read(&parent_counter->count); + child_val = atomic64_read(&child_counter->count); + + /* + * Add back the child's count to the parent's count: + */ + atomic64_add(child_val, &parent_counter->count); + + fput(parent_counter->filp); + + kfree(child_counter); +} + +/* + * When a child task exist, feed back counter values to parent counters. + * + * Note: we are running in child context, but the PID is not hashed + * anymore so new counters will not be added. + */ +void perf_counter_exit_task(struct task_struct *child) +{ + struct perf_counter *child_counter, *tmp; + struct perf_counter_context *child_ctx; + + child_ctx = &child->perf_counter_ctx; + + if (likely(!child_ctx->nr_counters)) + return; + + list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, + list_entry) + __perf_counter_exit_task(child, child_counter, child_ctx); +} + +/* + * Initialize the perf_counter context in task_struct + */ +void perf_counter_init_task(struct task_struct *child) +{ + struct perf_counter_context *child_ctx, *parent_ctx; + struct perf_counter *counter, *parent_counter; + struct task_struct *parent = current; + unsigned long flags; + + child_ctx = &child->perf_counter_ctx; + parent_ctx = &parent->perf_counter_ctx; + + __perf_counter_init_context(child_ctx, child); + + /* + * This is executed from the parent task context, so inherit + * counters that have been marked for cloning: + */ + + if (likely(!parent_ctx->nr_counters)) + return; + + /* + * Lock the parent list. No need to lock the child - not PID + * hashed yet and not running, so nobody can access it. + */ + spin_lock_irqsave(&parent_ctx->lock, flags); + + /* + * We dont have to disable NMIs - we are only looking at + * the list, not manipulating it: + */ + list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) { + if (!counter->hw_event.inherit || counter->group_leader != counter) + continue; + + /* + * Instead of creating recursive hierarchies of counters, + * we link inheritd counters back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + parent_counter = counter; + if (counter->parent) + parent_counter = counter->parent; + + if (inherit_counter(parent_counter, parent, + parent_ctx, child, child_ctx)) + break; + } + + spin_unlock_irqrestore(&parent_ctx->lock, flags); +} + static void __cpuinit perf_counter_init_cpu(int cpu) { struct perf_cpu_context *cpuctx; -- cgit v1.2.3 From 8cb391e8786c8072367f0aeb90551903fef074ba Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 12:22:31 +0100 Subject: perfcounters: fix task clock counter Impact: bugfix Update the task clock counter to the new math. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f5e81dd193d..1f81cde0dc4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -850,17 +850,36 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { .hw_perf_counter_read = cpu_clock_perf_counter_read, }; -static void task_clock_perf_counter_enable(struct perf_counter *counter) +static void task_clock_perf_counter_update(struct perf_counter *counter) { + u64 prev, now; + s64 delta; + + prev = atomic64_read(&counter->hw.prev_count); + now = current->se.sum_exec_runtime; + + atomic64_set(&counter->hw.prev_count, now); + + delta = now - prev; + if (WARN_ON_ONCE(delta < 0)) + delta = 0; + + atomic64_add(delta, &counter->count); } -static void task_clock_perf_counter_disable(struct perf_counter *counter) +static void task_clock_perf_counter_read(struct perf_counter *counter) { + task_clock_perf_counter_update(counter); } -static void task_clock_perf_counter_read(struct perf_counter *counter) +static void task_clock_perf_counter_enable(struct perf_counter *counter) +{ + atomic64_set(&counter->hw.prev_count, current->se.sum_exec_runtime); +} + +static void task_clock_perf_counter_disable(struct perf_counter *counter) { - atomic64_set(&counter->count, current->se.sum_exec_runtime); + task_clock_perf_counter_update(counter); } static const struct hw_perf_counter_ops perf_ops_task_clock = { -- cgit v1.2.3 From 5d6a27d8a096868ae313f71f563b06074a7e34fe Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 12:28:33 +0100 Subject: perfcounters: add context switch counter Impact: add new feature, new sw counter Add a counter that counts the number of context-switches a task is doing. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1f81cde0dc4..09287091c52 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -888,6 +888,54 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .hw_perf_counter_read = task_clock_perf_counter_read, }; +static u64 get_context_switches(void) +{ + struct task_struct *curr = current; + + return curr->nvcsw + curr->nivcsw; +} + +static void context_switches_perf_counter_update(struct perf_counter *counter) +{ + u64 prev, now; + s64 delta; + + prev = atomic64_read(&counter->hw.prev_count); + now = get_context_switches(); + + atomic64_set(&counter->hw.prev_count, now); + + delta = now - prev; + if (WARN_ON_ONCE(delta < 0)) + delta = 0; + + atomic64_add(delta, &counter->count); +} + +static void context_switches_perf_counter_read(struct perf_counter *counter) +{ + context_switches_perf_counter_update(counter); +} + +static void context_switches_perf_counter_enable(struct perf_counter *counter) +{ + /* + * ->nvcsw + curr->nivcsw is a per-task value already, + * so we dont have to clear it on switch-in. + */ +} + +static void context_switches_perf_counter_disable(struct perf_counter *counter) +{ + context_switches_perf_counter_update(counter); +} + +static const struct hw_perf_counter_ops perf_ops_context_switches = { + .hw_perf_counter_enable = context_switches_perf_counter_enable, + .hw_perf_counter_disable = context_switches_perf_counter_disable, + .hw_perf_counter_read = context_switches_perf_counter_read, +}; + static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { @@ -900,6 +948,9 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_TASK_CLOCK: hw_ops = &perf_ops_task_clock; break; + case PERF_COUNT_CONTEXT_SWITCHES: + hw_ops = &perf_ops_context_switches; + break; default: break; } -- cgit v1.2.3 From 6c594c21fcb02c662f11c97be4d7d2b73060a205 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 12:34:15 +0100 Subject: perfcounters: add task migrations counter Impact: add new feature, new sw counter Add a counter that counts the number of cross-CPU migrations a task is suffering. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched.c | 7 +++++-- 2 files changed, 54 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 09287091c52..fb11e351e44 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -936,6 +936,52 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = { .hw_perf_counter_read = context_switches_perf_counter_read, }; +static inline u64 get_cpu_migrations(void) +{ + return current->se.nr_migrations; +} + +static void cpu_migrations_perf_counter_update(struct perf_counter *counter) +{ + u64 prev, now; + s64 delta; + + prev = atomic64_read(&counter->hw.prev_count); + now = get_cpu_migrations(); + + atomic64_set(&counter->hw.prev_count, now); + + delta = now - prev; + if (WARN_ON_ONCE(delta < 0)) + delta = 0; + + atomic64_add(delta, &counter->count); +} + +static void cpu_migrations_perf_counter_read(struct perf_counter *counter) +{ + cpu_migrations_perf_counter_update(counter); +} + +static void cpu_migrations_perf_counter_enable(struct perf_counter *counter) +{ + /* + * se.nr_migrations is a per-task value already, + * so we dont have to clear it on switch-in. + */ +} + +static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) +{ + cpu_migrations_perf_counter_update(counter); +} + +static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { + .hw_perf_counter_enable = cpu_migrations_perf_counter_enable, + .hw_perf_counter_disable = cpu_migrations_perf_counter_disable, + .hw_perf_counter_read = cpu_migrations_perf_counter_read, +}; + static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { @@ -951,6 +997,9 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_CONTEXT_SWITCHES: hw_ops = &perf_ops_context_switches; break; + case PERF_COUNT_CPU_MIGRATIONS: + hw_ops = &perf_ops_cpu_migrations; + break; default: break; } diff --git a/kernel/sched.c b/kernel/sched.c index 5c3f4106314..382cfdb5e38 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1852,12 +1852,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.sleep_start -= clock_offset; if (p->se.block_start) p->se.block_start -= clock_offset; +#endif if (old_cpu != new_cpu) { - schedstat_inc(p, se.nr_migrations); + p->se.nr_migrations++; +#ifdef CONFIG_SCHEDSTATS if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); - } #endif + } p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime; @@ -2375,6 +2377,7 @@ static void __sched_fork(struct task_struct *p) p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; p->se.last_wakeup = 0; p->se.avg_overlap = 0; -- cgit v1.2.3 From e06c61a879910869aa5bf3f8f634abfee1a7bebc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 14:44:31 +0100 Subject: perfcounters: add nr-of-faults counter Impact: add new feature, new sw counter Add a counter that counts the number of pagefaults a task is experiencing. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index fb11e351e44..59c52f9ee43 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -888,6 +888,54 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .hw_perf_counter_read = task_clock_perf_counter_read, }; +static u64 get_page_faults(void) +{ + struct task_struct *curr = current; + + return curr->maj_flt + curr->min_flt; +} + +static void page_faults_perf_counter_update(struct perf_counter *counter) +{ + u64 prev, now; + s64 delta; + + prev = atomic64_read(&counter->hw.prev_count); + now = get_page_faults(); + + atomic64_set(&counter->hw.prev_count, now); + + delta = now - prev; + if (WARN_ON_ONCE(delta < 0)) + delta = 0; + + atomic64_add(delta, &counter->count); +} + +static void page_faults_perf_counter_read(struct perf_counter *counter) +{ + page_faults_perf_counter_update(counter); +} + +static void page_faults_perf_counter_enable(struct perf_counter *counter) +{ + /* + * page-faults is a per-task value already, + * so we dont have to clear it on switch-in. + */ +} + +static void page_faults_perf_counter_disable(struct perf_counter *counter) +{ + page_faults_perf_counter_update(counter); +} + +static const struct hw_perf_counter_ops perf_ops_page_faults = { + .hw_perf_counter_enable = page_faults_perf_counter_enable, + .hw_perf_counter_disable = page_faults_perf_counter_disable, + .hw_perf_counter_read = page_faults_perf_counter_read, +}; + static u64 get_context_switches(void) { struct task_struct *curr = current; @@ -994,6 +1042,9 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_TASK_CLOCK: hw_ops = &perf_ops_task_clock; break; + case PERF_COUNT_PAGE_FAULTS: + hw_ops = &perf_ops_page_faults; + break; case PERF_COUNT_CONTEXT_SWITCHES: hw_ops = &perf_ops_context_switches; break; -- cgit v1.2.3 From 088e2852c858159d47f71ee8da38e0fb1b21f806 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 20:21:00 +0100 Subject: perfcounters, x86: fix sw counters on non-PMC CPUs Make perf_max_counters default to at least 1 - this allows the sw counters to be used. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 59c52f9ee43..539fa8283a0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -25,7 +25,7 @@ */ DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); -int perf_max_counters __read_mostly; +int perf_max_counters __read_mostly = 1; static int perf_reserved_percpu __read_mostly; static int perf_overcommit __read_mostly = 1; -- cgit v1.2.3 From 0cc0c027d4e028632933f1be2dc4cd730358183b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 23:20:36 +0100 Subject: perfcounters: release CPU context when exiting task counters If counters are exiting via do_exit() not via filp close, then the CPU context needs to be released - otherwise future percpu counter creations might fail. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 539fa8283a0..16396e9406f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1273,8 +1273,19 @@ __perf_counter_exit_task(struct task_struct *child, local_irq_disable(); perf_flags = hw_perf_save_disable(); - if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) + if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) { + struct perf_cpu_context *cpuctx; + + cpuctx = &__get_cpu_var(perf_cpu_context); + child_counter->hw_ops->hw_perf_counter_disable(child_counter); + child_counter->state = PERF_COUNTER_STATE_INACTIVE; + child_counter->oncpu = -1; + + cpuctx->active_oncpu--; + child_ctx->nr_active--; + } + list_del_init(&child_counter->list_entry); hw_perf_restore(perf_flags); @@ -1539,4 +1550,3 @@ static int __init perf_counter_sysfs_init(void) &perfclass_attr_group); } device_initcall(perf_counter_sysfs_init); - -- cgit v1.2.3 From a86ed50859d65a08beec9474df97b88438a996df Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 00:43:10 +0100 Subject: perfcounters: use hw_event.disable flag Impact: implement default-off counters Make sure that counters that are created with counter.hw_event.disabled=1, get created in disabled state. They can be enabled via: prctl(PR_TASK_PERF_COUNTERS_ENABLE); Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 16396e9406f..5431e790b5d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1093,6 +1093,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->group_leader = group_leader; counter->hw_ops = NULL; + if (hw_event->disabled) + counter->state = PERF_COUNTER_STATE_OFF; + hw_ops = NULL; if (!hw_event->raw && hw_event->type < 0) hw_ops = sw_perf_counter_init(counter); -- cgit v1.2.3 From 8fb9331391af95ca1f4e5c0a0da8120b13cbae01 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:04:16 +0100 Subject: perfcounters: remove warnings Impact: remove debug checks Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5431e790b5d..aab6c123b02 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -861,8 +861,6 @@ static void task_clock_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } @@ -906,8 +904,6 @@ static void page_faults_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } @@ -954,8 +950,6 @@ static void context_switches_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } @@ -1000,8 +994,6 @@ static void cpu_migrations_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } -- cgit v1.2.3 From 7995888fcb0246543ee8027bf2835a250ba8c925 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 08:54:56 +0100 Subject: perfcounters: tweak group scheduling Impact: schedule in groups atomically If there are multiple groups in a task, make sure they are scheduled in and out atomically. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index aab6c123b02..f8a4d9a5d5d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -367,21 +367,26 @@ counter_sched_in(struct perf_counter *counter, ctx->nr_active++; } -static void +static int group_sched_in(struct perf_counter *group_counter, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu) { struct perf_counter *counter; + int was_group = 0; counter_sched_in(group_counter, cpuctx, ctx, cpu); /* * Schedule in siblings as one group (if any): */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { counter_sched_in(counter, cpuctx, ctx, cpu); + was_group = 1; + } + + return was_group; } /* @@ -416,7 +421,12 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) if (counter->cpu != -1 && counter->cpu != cpu) continue; - group_sched_in(counter, cpuctx, ctx, cpu); + /* + * If we scheduled in a group atomically and + * exclusively, break out: + */ + if (group_sched_in(counter, cpuctx, ctx, cpu)) + break; } spin_unlock(&ctx->lock); -- cgit v1.2.3 From 7671581f1666ef4b54a1c1e598c51ac44c060a9b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 14:20:28 +0100 Subject: perfcounters: hw ops rename Impact: rename field names Shorten them. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f8a4d9a5d5d..961d651aa57 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -109,7 +109,7 @@ static void __perf_counter_remove_from_context(void *info) spin_lock_irqsave(&ctx->lock, flags); if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - counter->hw_ops->hw_perf_counter_disable(counter); + counter->hw_ops->disable(counter); counter->state = PERF_COUNTER_STATE_INACTIVE; ctx->nr_active--; cpuctx->active_oncpu--; @@ -226,7 +226,7 @@ static void __perf_install_in_context(void *info) counter->oncpu = cpu; ctx->nr_active++; cpuctx->active_oncpu++; - counter->hw_ops->hw_perf_counter_enable(counter); + counter->hw_ops->enable(counter); } if (!ctx->task && cpuctx->max_pertask) @@ -297,7 +297,7 @@ counter_sched_out(struct perf_counter *counter, if (counter->state != PERF_COUNTER_STATE_ACTIVE) return; - counter->hw_ops->hw_perf_counter_disable(counter); + counter->hw_ops->disable(counter); counter->state = PERF_COUNTER_STATE_INACTIVE; counter->oncpu = -1; @@ -327,7 +327,7 @@ group_sched_out(struct perf_counter *group_counter, * * We stop each counter and update the counter value in counter->count. * - * This does not protect us against NMI, but hw_perf_counter_disable() + * This does not protect us against NMI, but disable() * sets the disabled bit in the control field of counter _before_ * accessing the counter control register. If a NMI hits, then it will * not restart the counter. @@ -359,7 +359,7 @@ counter_sched_in(struct perf_counter *counter, if (counter->state == PERF_COUNTER_STATE_OFF) return; - counter->hw_ops->hw_perf_counter_enable(counter); + counter->hw_ops->enable(counter); counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ @@ -395,7 +395,7 @@ group_sched_in(struct perf_counter *group_counter, * * We restore the counter value and then enable it. * - * This does not protect us against NMI, but hw_perf_counter_enable() + * This does not protect us against NMI, but enable() * sets the enabled bit in the control field of counter _before_ * accessing the counter control register. If a NMI hits, then it will * keep the counter running. @@ -537,11 +537,11 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) /* * Cross CPU call to read the hardware counter */ -static void __hw_perf_counter_read(void *info) +static void __read(void *info) { struct perf_counter *counter = info; - counter->hw_ops->hw_perf_counter_read(counter); + counter->hw_ops->read(counter); } static u64 perf_counter_read(struct perf_counter *counter) @@ -552,7 +552,7 @@ static u64 perf_counter_read(struct perf_counter *counter) */ if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, - __hw_perf_counter_read, counter, 1); + __read, counter, 1); } return atomic64_read(&counter->count); @@ -855,9 +855,9 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_cpu_clock = { - .hw_perf_counter_enable = cpu_clock_perf_counter_enable, - .hw_perf_counter_disable = cpu_clock_perf_counter_disable, - .hw_perf_counter_read = cpu_clock_perf_counter_read, + .enable = cpu_clock_perf_counter_enable, + .disable = cpu_clock_perf_counter_disable, + .read = cpu_clock_perf_counter_read, }; static void task_clock_perf_counter_update(struct perf_counter *counter) @@ -891,9 +891,9 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_task_clock = { - .hw_perf_counter_enable = task_clock_perf_counter_enable, - .hw_perf_counter_disable = task_clock_perf_counter_disable, - .hw_perf_counter_read = task_clock_perf_counter_read, + .enable = task_clock_perf_counter_enable, + .disable = task_clock_perf_counter_disable, + .read = task_clock_perf_counter_read, }; static u64 get_page_faults(void) @@ -937,9 +937,9 @@ static void page_faults_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_page_faults = { - .hw_perf_counter_enable = page_faults_perf_counter_enable, - .hw_perf_counter_disable = page_faults_perf_counter_disable, - .hw_perf_counter_read = page_faults_perf_counter_read, + .enable = page_faults_perf_counter_enable, + .disable = page_faults_perf_counter_disable, + .read = page_faults_perf_counter_read, }; static u64 get_context_switches(void) @@ -983,9 +983,9 @@ static void context_switches_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_context_switches = { - .hw_perf_counter_enable = context_switches_perf_counter_enable, - .hw_perf_counter_disable = context_switches_perf_counter_disable, - .hw_perf_counter_read = context_switches_perf_counter_read, + .enable = context_switches_perf_counter_enable, + .disable = context_switches_perf_counter_disable, + .read = context_switches_perf_counter_read, }; static inline u64 get_cpu_migrations(void) @@ -1027,9 +1027,9 @@ static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { - .hw_perf_counter_enable = cpu_migrations_perf_counter_enable, - .hw_perf_counter_disable = cpu_migrations_perf_counter_disable, - .hw_perf_counter_read = cpu_migrations_perf_counter_read, + .enable = cpu_migrations_perf_counter_enable, + .disable = cpu_migrations_perf_counter_disable, + .read = cpu_migrations_perf_counter_read, }; static const struct hw_perf_counter_ops * @@ -1283,7 +1283,7 @@ __perf_counter_exit_task(struct task_struct *child, cpuctx = &__get_cpu_var(perf_cpu_context); - child_counter->hw_ops->hw_perf_counter_disable(child_counter); + child_counter->hw_ops->disable(child_counter); child_counter->state = PERF_COUNTER_STATE_INACTIVE; child_counter->oncpu = -1; -- cgit v1.2.3 From aa9c4c0f967fdb482ea95e8473ec3d201e6e0781 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 14:10:57 +0100 Subject: perfcounters: fix task clock counter Impact: fix per task clock counter precision Signed-off-by: Ingo Molnar --- kernel/exit.c | 17 +++++++++---- kernel/perf_counter.c | 70 +++++++++++++++++++++++++++++++++++++++------------ kernel/sched.c | 49 +++++++++++++++++++++++++++++++++--- 3 files changed, 112 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index d336c90a5f1..244edfd9686 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -922,6 +922,12 @@ static void exit_notify(struct task_struct *tsk, int group_dead) forget_original_parent(tsk); exit_task_namespaces(tsk); + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(tsk); + write_lock_irq(&tasklist_lock); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); @@ -1093,11 +1099,6 @@ NORET_TYPE void do_exit(long code) mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif - /* - * These must happen late, after the PID is not - * hashed anymore, but still at a point that may sleep: - */ - perf_counter_exit_task(tsk); #ifdef CONFIG_FUTEX if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); @@ -1121,6 +1122,12 @@ NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); + /* + * These must happen late, after the PID is not + * hashed anymore, but still at a point that may sleep: + */ + perf_counter_exit_task(tsk); + preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 961d651aa57..f1110ac1267 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -18,6 +18,7 @@ #include #include #include +#include #include /* @@ -106,7 +107,8 @@ static void __perf_counter_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock_irqsave(&ctx->lock, flags); + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); if (counter->state == PERF_COUNTER_STATE_ACTIVE) { counter->hw_ops->disable(counter); @@ -135,7 +137,8 @@ static void __perf_counter_remove_from_context(void *info) perf_max_counters - perf_reserved_percpu); } - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); } @@ -209,7 +212,8 @@ static void __perf_install_in_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock_irqsave(&ctx->lock, flags); + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); /* * Protect the list operation against NMI by disabling the @@ -232,7 +236,8 @@ static void __perf_install_in_context(void *info) if (!ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); } /* @@ -438,15 +443,19 @@ int perf_counter_task_disable(void) struct task_struct *curr = current; struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; + unsigned long flags; u64 perf_flags; int cpu; if (likely(!ctx->nr_counters)) return 0; - local_irq_disable(); + curr_rq_lock_irq_save(&flags); cpu = smp_processor_id(); + /* force the update of the task clock: */ + __task_delta_exec(curr, 1); + perf_counter_task_sched_out(curr, cpu); spin_lock(&ctx->lock); @@ -463,7 +472,7 @@ int perf_counter_task_disable(void) spin_unlock(&ctx->lock); - local_irq_enable(); + curr_rq_unlock_irq_restore(&flags); return 0; } @@ -473,15 +482,19 @@ int perf_counter_task_enable(void) struct task_struct *curr = current; struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; + unsigned long flags; u64 perf_flags; int cpu; if (likely(!ctx->nr_counters)) return 0; - local_irq_disable(); + curr_rq_lock_irq_save(&flags); cpu = smp_processor_id(); + /* force the update of the task clock: */ + __task_delta_exec(curr, 1); + spin_lock(&ctx->lock); /* @@ -493,6 +506,7 @@ int perf_counter_task_enable(void) if (counter->state != PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->hw_event.disabled = 0; } hw_perf_restore(perf_flags); @@ -500,7 +514,7 @@ int perf_counter_task_enable(void) perf_counter_task_sched_in(curr, cpu); - local_irq_enable(); + curr_rq_unlock_irq_restore(&flags); return 0; } @@ -540,8 +554,11 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) static void __read(void *info) { struct perf_counter *counter = info; + unsigned long flags; + curr_rq_lock_irq_save(&flags); counter->hw_ops->read(counter); + curr_rq_unlock_irq_restore(&flags); } static u64 perf_counter_read(struct perf_counter *counter) @@ -860,13 +877,27 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { .read = cpu_clock_perf_counter_read, }; -static void task_clock_perf_counter_update(struct perf_counter *counter) +/* + * Called from within the scheduler: + */ +static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update) { - u64 prev, now; + struct task_struct *curr = counter->task; + u64 delta; + + WARN_ON_ONCE(counter->task != current); + + delta = __task_delta_exec(curr, update); + + return curr->se.sum_exec_runtime + delta; +} + +static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) +{ + u64 prev; s64 delta; prev = atomic64_read(&counter->hw.prev_count); - now = current->se.sum_exec_runtime; atomic64_set(&counter->hw.prev_count, now); @@ -877,17 +908,23 @@ static void task_clock_perf_counter_update(struct perf_counter *counter) static void task_clock_perf_counter_read(struct perf_counter *counter) { - task_clock_perf_counter_update(counter); + u64 now = task_clock_perf_counter_val(counter, 1); + + task_clock_perf_counter_update(counter, now); } static void task_clock_perf_counter_enable(struct perf_counter *counter) { - atomic64_set(&counter->hw.prev_count, current->se.sum_exec_runtime); + u64 now = task_clock_perf_counter_val(counter, 0); + + atomic64_set(&counter->hw.prev_count, now); } static void task_clock_perf_counter_disable(struct perf_counter *counter) { - task_clock_perf_counter_update(counter); + u64 now = task_clock_perf_counter_val(counter, 0); + + task_clock_perf_counter_update(counter, now); } static const struct hw_perf_counter_ops perf_ops_task_clock = { @@ -1267,6 +1304,7 @@ __perf_counter_exit_task(struct task_struct *child, { struct perf_counter *parent_counter; u64 parent_val, child_val; + unsigned long flags; u64 perf_flags; /* @@ -1275,7 +1313,7 @@ __perf_counter_exit_task(struct task_struct *child, * Be careful about zapping the list - IRQ/NMI context * could still be processing it: */ - local_irq_disable(); + curr_rq_lock_irq_save(&flags); perf_flags = hw_perf_save_disable(); if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) { @@ -1294,7 +1332,7 @@ __perf_counter_exit_task(struct task_struct *child, list_del_init(&child_counter->list_entry); hw_perf_restore(perf_flags); - local_irq_enable(); + curr_rq_unlock_irq_restore(&flags); parent_counter = child_counter->parent; /* diff --git a/kernel/sched.c b/kernel/sched.c index 382cfdb5e38..4d84ff4c877 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -638,7 +638,7 @@ static inline int cpu_of(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -static inline void update_rq_clock(struct rq *rq) +inline void update_rq_clock(struct rq *rq) { rq->clock = sched_clock_cpu(cpu_of(rq)); } @@ -969,6 +969,26 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } +void curr_rq_lock_irq_save(unsigned long *flags) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_save(*flags); + rq = cpu_rq(smp_processor_id()); + spin_lock(&rq->lock); +} + +void curr_rq_unlock_irq_restore(unsigned long *flags) + __releases(rq->lock) +{ + struct rq *rq; + + rq = cpu_rq(smp_processor_id()); + spin_unlock(&rq->lock); + local_irq_restore(*flags); +} + void task_rq_unlock_wait(struct task_struct *p) { struct rq *rq = task_rq(p); @@ -2558,7 +2578,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { fire_sched_out_preempt_notifiers(prev, next); - perf_counter_task_sched_out(prev, cpu_of(rq)); prepare_lock_switch(rq, next); prepare_arch_switch(next); } @@ -4089,6 +4108,29 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); +/* + * Return any ns on the sched_clock that have not yet been banked in + * @p in case that task is currently running. + */ +unsigned long long __task_delta_exec(struct task_struct *p, int update) +{ + s64 delta_exec; + struct rq *rq; + + rq = task_rq(p); + WARN_ON_ONCE(!runqueue_is_locked()); + WARN_ON_ONCE(!task_current(rq, p)); + + if (update) + update_rq_clock(rq); + + delta_exec = rq->clock - p->se.exec_start; + + WARN_ON_ONCE(delta_exec < 0); + + return delta_exec; +} + /* * Return any ns on the sched_clock that have not yet been banked in * @p in case that task is currently running. @@ -4316,13 +4358,13 @@ void scheduler_tick(void) update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); + perf_counter_task_tick(curr, cpu); spin_unlock(&rq->lock); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif - perf_counter_task_tick(curr, cpu); } #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ @@ -4512,6 +4554,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); + perf_counter_task_sched_out(prev, cpu); rq->nr_switches++; rq->curr = next; -- cgit v1.2.3 From eef6cbf5844c620d9db9be99e4908cdf92492fb9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 10:20:42 +0100 Subject: perfcounters: pull inherited counters Change counter inheritance from a 'push' to a 'pull' model: instead of child tasks pushing their final counts to the parent, reuse the wait4 infrastructure to pull counters as child tasks are exit-processed, much like how cutime/cstime is collected. Signed-off-by: Ingo Molnar --- kernel/exit.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 244edfd9686..101b7eeff44 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -153,6 +153,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); +#ifdef CONFIG_PERF_COUNTERS + WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); +#endif trace_sched_process_free(tsk); put_task_struct(tsk); } @@ -922,12 +925,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) forget_original_parent(tsk); exit_task_namespaces(tsk); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - */ - perf_counter_exit_task(tsk); - write_lock_irq(&tasklist_lock); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); @@ -1122,12 +1119,6 @@ NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); - /* - * These must happen late, after the PID is not - * hashed anymore, but still at a point that may sleep: - */ - perf_counter_exit_task(tsk); - preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; @@ -1371,6 +1362,12 @@ static int wait_task_zombie(struct task_struct *p, int options, */ read_unlock(&tasklist_lock); + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(p); + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; -- cgit v1.2.3 From 95cdd2e7851cce79ab839cb0b3cbe68d7911d0f1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 21 Dec 2008 13:50:42 +0100 Subject: perfcounters: enable lowlevel pmc code to schedule counters Allow lowlevel ->enable() op to return an error if a counter can not be added. This can be used to handle counter constraints. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 62 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f1110ac1267..2e73929a695 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -355,21 +355,25 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) cpuctx->task_ctx = NULL; } -static void +static int counter_sched_in(struct perf_counter *counter, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu) { if (counter->state == PERF_COUNTER_STATE_OFF) - return; + return 0; + + if (counter->hw_ops->enable(counter)) + return -EAGAIN; - counter->hw_ops->enable(counter); counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ cpuctx->active_oncpu++; ctx->nr_active++; + + return 0; } static int @@ -378,20 +382,38 @@ group_sched_in(struct perf_counter *group_counter, struct perf_counter_context *ctx, int cpu) { - struct perf_counter *counter; - int was_group = 0; + struct perf_counter *counter, *partial_group; + int ret = 0; - counter_sched_in(group_counter, cpuctx, ctx, cpu); + if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) + return -EAGAIN; /* * Schedule in siblings as one group (if any): */ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - counter_sched_in(counter, cpuctx, ctx, cpu); - was_group = 1; + if (counter_sched_in(counter, cpuctx, ctx, cpu)) { + partial_group = counter; + goto group_error; + } + ret = -EAGAIN; } - return was_group; + return ret; + +group_error: + /* + * Groups can be scheduled in as one unit only, so undo any + * partial group before returning: + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { + if (counter == partial_group) + break; + counter_sched_out(counter, cpuctx, ctx); + } + counter_sched_out(group_counter, cpuctx, ctx); + + return -EAGAIN; } /* @@ -416,9 +438,6 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) spin_lock(&ctx->lock); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (ctx->nr_active == cpuctx->max_pertask) - break; - /* * Listen to the 'cpu' scheduling filter constraint * of counters: @@ -856,8 +875,9 @@ static const struct file_operations perf_fops = { .poll = perf_poll, }; -static void cpu_clock_perf_counter_enable(struct perf_counter *counter) +static int cpu_clock_perf_counter_enable(struct perf_counter *counter) { + return 0; } static void cpu_clock_perf_counter_disable(struct perf_counter *counter) @@ -913,11 +933,13 @@ static void task_clock_perf_counter_read(struct perf_counter *counter) task_clock_perf_counter_update(counter, now); } -static void task_clock_perf_counter_enable(struct perf_counter *counter) +static int task_clock_perf_counter_enable(struct perf_counter *counter) { u64 now = task_clock_perf_counter_val(counter, 0); atomic64_set(&counter->hw.prev_count, now); + + return 0; } static void task_clock_perf_counter_disable(struct perf_counter *counter) @@ -960,12 +982,14 @@ static void page_faults_perf_counter_read(struct perf_counter *counter) page_faults_perf_counter_update(counter); } -static void page_faults_perf_counter_enable(struct perf_counter *counter) +static int page_faults_perf_counter_enable(struct perf_counter *counter) { /* * page-faults is a per-task value already, * so we dont have to clear it on switch-in. */ + + return 0; } static void page_faults_perf_counter_disable(struct perf_counter *counter) @@ -1006,12 +1030,14 @@ static void context_switches_perf_counter_read(struct perf_counter *counter) context_switches_perf_counter_update(counter); } -static void context_switches_perf_counter_enable(struct perf_counter *counter) +static int context_switches_perf_counter_enable(struct perf_counter *counter) { /* * ->nvcsw + curr->nivcsw is a per-task value already, * so we dont have to clear it on switch-in. */ + + return 0; } static void context_switches_perf_counter_disable(struct perf_counter *counter) @@ -1050,12 +1076,14 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter) cpu_migrations_perf_counter_update(counter); } -static void cpu_migrations_perf_counter_enable(struct perf_counter *counter) +static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) { /* * se.nr_migrations is a per-task value already, * so we dont have to clear it on switch-in. */ + + return 0; } static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) -- cgit v1.2.3 From 8fe91e61cdc407c7556d3cd71cf20141a25bbcea Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:29:25 +0100 Subject: perfcounters: remove ->nr_inherited Impact: remove dead code nr_inherited was not maintained correctly (not decremented) - and also not used - remove it. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2e73929a695..48e1dbcdc1c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1308,7 +1308,6 @@ inherit_counter(struct perf_counter *parent_counter, child_ctx->nr_counters++; child_counter->parent = parent_counter; - parent_counter->nr_inherited++; /* * inherit into child's child as well: */ -- cgit v1.2.3 From 235c7fc7c500e4fd1700c4ad01b5612bcdc1b449 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 21 Dec 2008 14:43:25 +0100 Subject: perfcounters: generalize the counter scheduler Impact: clean up and refactor code refactor the counter scheduler: separate out in/out functions and introduce a counter-rotation function as well. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 220 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 142 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 48e1dbcdc1c..d7a79f321b1 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -111,11 +111,12 @@ static void __perf_counter_remove_from_context(void *info) spin_lock(&ctx->lock); if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - counter->hw_ops->disable(counter); counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->hw_ops->disable(counter); ctx->nr_active--; cpuctx->active_oncpu--; counter->task = NULL; + counter->oncpu = -1; } ctx->nr_counters--; @@ -192,8 +193,36 @@ retry: spin_unlock_irq(&ctx->lock); } +static int +counter_sched_in(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, + int cpu) +{ + if (counter->state == PERF_COUNTER_STATE_OFF) + return 0; + + counter->state = PERF_COUNTER_STATE_ACTIVE; + counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + /* + * The new state must be visible before we turn it on in the hardware: + */ + smp_wmb(); + + if (counter->hw_ops->enable(counter)) { + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->oncpu = -1; + return -EAGAIN; + } + + cpuctx->active_oncpu++; + ctx->nr_active++; + + return 0; +} + /* - * Cross CPU call to install and enable a preformance counter + * Cross CPU call to install and enable a performance counter */ static void __perf_install_in_context(void *info) { @@ -220,22 +249,17 @@ static void __perf_install_in_context(void *info) * counters on a global level. NOP for non NMI based counters. */ perf_flags = hw_perf_save_disable(); - list_add_counter(counter, ctx); - hw_perf_restore(perf_flags); + list_add_counter(counter, ctx); ctx->nr_counters++; - if (cpuctx->active_oncpu < perf_max_counters) { - counter->state = PERF_COUNTER_STATE_ACTIVE; - counter->oncpu = cpu; - ctx->nr_active++; - cpuctx->active_oncpu++; - counter->hw_ops->enable(counter); - } + counter_sched_in(counter, cpuctx, ctx, cpu); if (!ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; + hw_perf_restore(perf_flags); + spin_unlock(&ctx->lock); curr_rq_unlock_irq_restore(&flags); } @@ -302,8 +326,8 @@ counter_sched_out(struct perf_counter *counter, if (counter->state != PERF_COUNTER_STATE_ACTIVE) return; - counter->hw_ops->disable(counter); counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->hw_ops->disable(counter); counter->oncpu = -1; cpuctx->active_oncpu--; @@ -326,6 +350,22 @@ group_sched_out(struct perf_counter *group_counter, counter_sched_out(counter, cpuctx, ctx); } +void __perf_counter_sched_out(struct perf_counter_context *ctx, + struct perf_cpu_context *cpuctx) +{ + struct perf_counter *counter; + + if (likely(!ctx->nr_counters)) + return; + + spin_lock(&ctx->lock); + if (ctx->nr_active) { + list_for_each_entry(counter, &ctx->counter_list, list_entry) + group_sched_out(counter, cpuctx, ctx); + } + spin_unlock(&ctx->lock); +} + /* * Called from scheduler to remove the counters of the current task, * with interrupts disabled. @@ -341,39 +381,18 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_counter_context *ctx = &task->perf_counter_ctx; - struct perf_counter *counter; if (likely(!cpuctx->task_ctx)) return; - spin_lock(&ctx->lock); - if (ctx->nr_active) { - list_for_each_entry(counter, &ctx->counter_list, list_entry) - group_sched_out(counter, cpuctx, ctx); - } - spin_unlock(&ctx->lock); + __perf_counter_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; } -static int -counter_sched_in(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, - int cpu) +static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) { - if (counter->state == PERF_COUNTER_STATE_OFF) - return 0; - - if (counter->hw_ops->enable(counter)) - return -EAGAIN; - - counter->state = PERF_COUNTER_STATE_ACTIVE; - counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ - - cpuctx->active_oncpu++; - ctx->nr_active++; - - return 0; + __perf_counter_sched_out(&cpuctx->ctx, cpuctx); } static int @@ -416,21 +435,10 @@ group_error: return -EAGAIN; } -/* - * Called from scheduler to add the counters of the current task - * with interrupts disabled. - * - * We restore the counter value and then enable it. - * - * This does not protect us against NMI, but enable() - * sets the enabled bit in the control field of counter _before_ - * accessing the counter control register. If a NMI hits, then it will - * keep the counter running. - */ -void perf_counter_task_sched_in(struct task_struct *task, int cpu) +static void +__perf_counter_sched_in(struct perf_counter_context *ctx, + struct perf_cpu_context *cpuctx, int cpu) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &task->perf_counter_ctx; struct perf_counter *counter; if (likely(!ctx->nr_counters)) @@ -453,10 +461,35 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) break; } spin_unlock(&ctx->lock); +} +/* + * Called from scheduler to add the counters of the current task + * with interrupts disabled. + * + * We restore the counter value and then enable it. + * + * This does not protect us against NMI, but enable() + * sets the enabled bit in the control field of counter _before_ + * accessing the counter control register. If a NMI hits, then it will + * keep the counter running. + */ +void perf_counter_task_sched_in(struct task_struct *task, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx = &task->perf_counter_ctx; + + __perf_counter_sched_in(ctx, cpuctx, cpu); cpuctx->task_ctx = ctx; } +static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +{ + struct perf_counter_context *ctx = &cpuctx->ctx; + + __perf_counter_sched_in(ctx, cpuctx, cpu); +} + int perf_counter_task_disable(void) { struct task_struct *curr = current; @@ -514,6 +547,8 @@ int perf_counter_task_enable(void) /* force the update of the task clock: */ __task_delta_exec(curr, 1); + perf_counter_task_sched_out(curr, cpu); + spin_lock(&ctx->lock); /* @@ -538,19 +573,18 @@ int perf_counter_task_enable(void) return 0; } -void perf_counter_task_tick(struct task_struct *curr, int cpu) +/* + * Round-robin a context's counters: + */ +static void rotate_ctx(struct perf_counter_context *ctx) { - struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; u64 perf_flags; - if (likely(!ctx->nr_counters)) + if (!ctx->nr_counters) return; - perf_counter_task_sched_out(curr, cpu); - spin_lock(&ctx->lock); - /* * Rotate the first entry last (works just fine for group counters too): */ @@ -563,7 +597,24 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) hw_perf_restore(perf_flags); spin_unlock(&ctx->lock); +} + +void perf_counter_task_tick(struct task_struct *curr, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx = &curr->perf_counter_ctx; + const int rotate_percpu = 0; + + if (rotate_percpu) + perf_counter_cpu_sched_out(cpuctx); + perf_counter_task_sched_out(curr, cpu); + if (rotate_percpu) + rotate_ctx(&cpuctx->ctx); + rotate_ctx(ctx); + + if (rotate_percpu) + perf_counter_cpu_sched_in(cpuctx, cpu); perf_counter_task_sched_in(curr, cpu); } @@ -905,8 +956,6 @@ static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update) struct task_struct *curr = counter->task; u64 delta; - WARN_ON_ONCE(counter->task != current); - delta = __task_delta_exec(curr, update); return curr->se.sum_exec_runtime + delta; @@ -1160,6 +1209,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->group_leader = group_leader; counter->hw_ops = NULL; + counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) counter->state = PERF_COUNTER_STATE_OFF; @@ -1331,35 +1381,49 @@ __perf_counter_exit_task(struct task_struct *child, { struct perf_counter *parent_counter; u64 parent_val, child_val; - unsigned long flags; - u64 perf_flags; /* - * Disable and unlink this counter. - * - * Be careful about zapping the list - IRQ/NMI context - * could still be processing it: + * If we do not self-reap then we have to wait for the + * child task to unschedule (it will happen for sure), + * so that its counter is at its final count. (This + * condition triggers rarely - child tasks usually get + * off their CPU before the parent has a chance to + * get this far into the reaping action) */ - curr_rq_lock_irq_save(&flags); - perf_flags = hw_perf_save_disable(); - - if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) { + if (child != current) { + wait_task_inactive(child, 0); + list_del_init(&child_counter->list_entry); + } else { struct perf_cpu_context *cpuctx; + unsigned long flags; + u64 perf_flags; + + /* + * Disable and unlink this counter. + * + * Be careful about zapping the list - IRQ/NMI context + * could still be processing it: + */ + curr_rq_lock_irq_save(&flags); + perf_flags = hw_perf_save_disable(); cpuctx = &__get_cpu_var(perf_cpu_context); - child_counter->hw_ops->disable(child_counter); - child_counter->state = PERF_COUNTER_STATE_INACTIVE; - child_counter->oncpu = -1; + if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) { + child_counter->state = PERF_COUNTER_STATE_INACTIVE; + child_counter->hw_ops->disable(child_counter); + cpuctx->active_oncpu--; + child_ctx->nr_active--; + child_counter->oncpu = -1; + } - cpuctx->active_oncpu--; - child_ctx->nr_active--; - } + list_del_init(&child_counter->list_entry); - list_del_init(&child_counter->list_entry); + child_ctx->nr_counters--; - hw_perf_restore(perf_flags); - curr_rq_unlock_irq_restore(&flags); + hw_perf_restore(perf_flags); + curr_rq_unlock_irq_restore(&flags); + } parent_counter = child_counter->parent; /* -- cgit v1.2.3 From 01ea1ccaa24dea3552e103be13b7897211607a8b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 26 Dec 2008 21:05:06 -0800 Subject: perf_counter: more barrier in blank weak function Impact: fix panic possible panic Some versions of GCC inline the weak global function if it's empty. Add a barrier() to work it around. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d7a79f321b1..37f771691f9 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -45,8 +45,8 @@ hw_perf_counter_init(struct perf_counter *counter) } u64 __weak hw_perf_save_disable(void) { return 0; } -void __weak hw_perf_restore(u64 ctrl) { } -void __weak hw_perf_counter_setup(void) { } +void __weak hw_perf_restore(u64 ctrl) { barrier(); } +void __weak hw_perf_counter_setup(void) { barrier(); } static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) -- cgit v1.2.3 From ff6f05416ece2caec1a7a1f8180d6598e0ab9272 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jan 2009 16:19:25 +1100 Subject: perf_counter: Fix return value from dummy hw_perf_counter_init Impact: fix oops-causing bug Currently, if you try to use perf_counters on an architecture that has no hardware support, and you select an event that doesn't map to any of the defined software counters, you get an oops rather than an error. This is because the dummy hw_perf_counter_init returns ERR_PTR(-EINVAL) but the caller (perf_counter_alloc) only tests for NULL. This makes the dummy hw_perf_counter_init return NULL instead. Signed-off-by: Paul Mackerras --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 37f771691f9..4be1a8d872b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -41,7 +41,7 @@ static DEFINE_MUTEX(perf_resource_mutex); extern __weak const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter) { - return ERR_PTR(-EINVAL); + return NULL; } u64 __weak hw_perf_save_disable(void) { return 0; } -- cgit v1.2.3 From 9abf8a08bc8f18a3b125f834f00e2e71b49c15d2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jan 2009 16:26:43 +1100 Subject: perf_counter: Fix the cpu_clock software counter Impact: bug fix Currently if you do (e.g.) timec -e -1 ls, it will report 0 for the value of the cpu_clock counter. The reason is that the core assumes that a counter's count field is up-to-date when the counter is inactive, and doesn't call the counter's read function. However, the cpu_clock counter code only updates the count in the read function. This fixes it by making both the read and disable functions update the count. It also makes the counter ignore time passing while the counter is disabled, by making the enable function update the hw.prev_count field. Signed-off-by: Paul Mackerras --- kernel/perf_counter.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4be1a8d872b..b7a027a2ef0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -928,18 +928,32 @@ static const struct file_operations perf_fops = { static int cpu_clock_perf_counter_enable(struct perf_counter *counter) { + int cpu = raw_smp_processor_id(); + + atomic64_set(&counter->hw.prev_count, cpu_clock(cpu)); return 0; } +static void cpu_clock_perf_counter_update(struct perf_counter *counter) +{ + int cpu = raw_smp_processor_id(); + s64 prev; + u64 now; + + now = cpu_clock(cpu); + prev = atomic64_read(&counter->hw.prev_count); + atomic64_set(&counter->hw.prev_count, now); + atomic64_add(now - prev, &counter->count); +} + static void cpu_clock_perf_counter_disable(struct perf_counter *counter) { + cpu_clock_perf_counter_update(counter); } static void cpu_clock_perf_counter_read(struct perf_counter *counter) { - int cpu = raw_smp_processor_id(); - - atomic64_set(&counter->count, cpu_clock(cpu)); + cpu_clock_perf_counter_update(counter); } static const struct hw_perf_counter_ops perf_ops_cpu_clock = { -- cgit v1.2.3 From 3cbed429a9ccdb7a243f733b1056fe5c39e9004c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jan 2009 16:43:42 +1100 Subject: perf_counter: Add optional hw_perf_group_sched_in arch function Impact: extend perf_counter infrastructure This adds an optional hw_perf_group_sched_in() arch function that enables a whole group of counters in one go. It returns 1 if it added the group successfully, 0 if it did nothing (and therefore the core needs to add the counters individually), or a negative number if an error occurred. It should add all the counters and enable any software counters in the group, or else add none of them and return an error. There are a couple of related changes/improvements in the group handling here: * As an optimization, group_sched_out() and group_sched_in() now check the state of the group leader, and do nothing if the leader is not active or disabled. * We now call hw_perf_save_disable/hw_perf_restore around the complete set of counter enable/disable calls in __perf_counter_sched_in/out, to give the arch code the opportunity to defer updating the hardware state until the hw_perf_restore call if it wants. * We no longer stop adding groups after we get to a group that has more than one counter. We will ultimately add an option for a group to be exclusive. The current code doesn't really implement exclusive groups anyway, since a group could end up going on with other counters that get added before it. Signed-off-by: Paul Mackerras --- kernel/perf_counter.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b7a027a2ef0..9ad11e44d9a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -47,6 +47,12 @@ hw_perf_counter_init(struct perf_counter *counter) u64 __weak hw_perf_save_disable(void) { return 0; } void __weak hw_perf_restore(u64 ctrl) { barrier(); } void __weak hw_perf_counter_setup(void) { barrier(); } +int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, int cpu) +{ + return 0; +} static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) @@ -341,6 +347,9 @@ group_sched_out(struct perf_counter *group_counter, { struct perf_counter *counter; + if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) + return; + counter_sched_out(group_counter, cpuctx, ctx); /* @@ -354,15 +363,18 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx) { struct perf_counter *counter; + u64 flags; if (likely(!ctx->nr_counters)) return; spin_lock(&ctx->lock); + flags = hw_perf_save_disable(); if (ctx->nr_active) { list_for_each_entry(counter, &ctx->counter_list, list_entry) group_sched_out(counter, cpuctx, ctx); } + hw_perf_restore(flags); spin_unlock(&ctx->lock); } @@ -402,7 +414,14 @@ group_sched_in(struct perf_counter *group_counter, int cpu) { struct perf_counter *counter, *partial_group; - int ret = 0; + int ret; + + if (group_counter->state == PERF_COUNTER_STATE_OFF) + return 0; + + ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); + if (ret) + return ret < 0 ? ret : 0; if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) return -EAGAIN; @@ -415,10 +434,9 @@ group_sched_in(struct perf_counter *group_counter, partial_group = counter; goto group_error; } - ret = -EAGAIN; } - return ret; + return 0; group_error: /* @@ -440,11 +458,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx, int cpu) { struct perf_counter *counter; + u64 flags; if (likely(!ctx->nr_counters)) return; spin_lock(&ctx->lock); + flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { /* * Listen to the 'cpu' scheduling filter constraint @@ -454,12 +474,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, continue; /* - * If we scheduled in a group atomically and - * exclusively, break out: + * If we scheduled in a group atomically and exclusively, + * or if this group can't go on, break out: */ if (group_sched_in(counter, cpuctx, ctx, cpu)) break; } + hw_perf_restore(flags); spin_unlock(&ctx->lock); } -- cgit v1.2.3 From 4eb96fcfe07b7f2a05577e57533840f8e26bea53 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jan 2009 17:24:34 +1100 Subject: perf_counter: Add dummy perf_counter_print_debug function Impact: minimize requirements on architectures Currently, an architecture just enabling CONFIG_PERF_COUNTERS but not providing any extra functions will fail to build with perf_counter_print_debug being undefined, since we don't provide an empty dummy definition like we do with the hw_perf_* functions. This provides an empty dummy perf_counter_print_debug() to make it easier for architectures to turn on CONFIG_PERF_COUNTERS. Signed-off-by: Paul Mackerras --- kernel/perf_counter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 9ad11e44d9a..4c0dccb756a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -54,6 +54,8 @@ int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, return 0; } +void __weak perf_counter_print_debug(void) { } + static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { -- cgit v1.2.3 From dd0e6ba22ea21bcc2c420b385a170593c58f4c08 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Jan 2009 15:11:00 +1100 Subject: perf_counter: Always schedule all software counters in Software counters aren't subject to the limitations imposed by the fixed number of hardware counter registers, so there is no reason not to enable them all in __perf_counter_sched_in. Previously we used to break out of the loop when we got to a group that wouldn't fit on the PMU; with this we continue through the list but only schedule in software counters (or groups containing only software counters) from there on. Signed-off-by: Paul Mackerras --- kernel/perf_counter.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4c0dccb756a..3aef3062ff7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -455,12 +455,37 @@ group_error: return -EAGAIN; } +/* + * Return 1 for a software counter, 0 for a hardware counter + */ +static inline int is_software_counter(struct perf_counter *counter) +{ + return !counter->hw_event.raw && counter->hw_event.type < 0; +} + +/* + * Return 1 for a group consisting entirely of software counters, + * 0 if the group contains any hardware counters. + */ +static int is_software_only_group(struct perf_counter *leader) +{ + struct perf_counter *counter; + + if (!is_software_counter(leader)) + return 0; + list_for_each_entry(counter, &leader->sibling_list, list_entry) + if (!is_software_counter(counter)) + return 0; + return 1; +} + static void __perf_counter_sched_in(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx, int cpu) { struct perf_counter *counter; u64 flags; + int can_add_hw = 1; if (likely(!ctx->nr_counters)) return; @@ -477,10 +502,12 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, /* * If we scheduled in a group atomically and exclusively, - * or if this group can't go on, break out: + * or if this group can't go on, don't add any more + * hardware counters. */ - if (group_sched_in(counter, cpuctx, ctx, cpu)) - break; + if (can_add_hw || is_software_only_group(counter)) + if (group_sched_in(counter, cpuctx, ctx, cpu)) + can_add_hw = 0; } hw_perf_restore(flags); spin_unlock(&ctx->lock); -- cgit v1.2.3 From 01d0287f068de2934109ba9b989d8807526cccc2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 14 Jan 2009 13:44:19 +1100 Subject: powerpc/perf_counter: Make sure PMU gets enabled properly This makes sure that we call the platform-specific ppc_md.enable_pmcs function on each CPU before we try to use the PMU on that CPU. If the CPU goes off-line and then on-line, we need to do the enable_pmcs call again, so we use the hw_perf_counter_setup hook to ensure that. It gets called as each CPU comes online, but it isn't called on the CPU that is coming up, so this adds the CPU number as an argument to it (there were no non-empty instances of hw_perf_counter_setup before). This also arranges to set the pmcregs_in_use field of the lppaca (data structure shared with the hypervisor) on each CPU when we are using the PMU and clear it when we are not. This allows the hypervisor to optimize partition switches by not saving/restoring the PMU registers when we aren't using the PMU. Signed-off-by: Paul Mackerras --- kernel/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3aef3062ff7..52f2f526248 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -46,7 +46,7 @@ hw_perf_counter_init(struct perf_counter *counter) u64 __weak hw_perf_save_disable(void) { return 0; } void __weak hw_perf_restore(u64 ctrl) { barrier(); } -void __weak hw_perf_counter_setup(void) { barrier(); } +void __weak hw_perf_counter_setup(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu) @@ -1598,7 +1598,7 @@ static void __cpuinit perf_counter_init_cpu(int cpu) cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; mutex_unlock(&perf_resource_mutex); - hw_perf_counter_setup(); + hw_perf_counter_setup(cpu); } #ifdef CONFIG_HOTPLUG_CPU -- cgit v1.2.3 From 3b6f9e5cb21964b7ce12bf81076f830885563ec8 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 14 Jan 2009 21:00:30 +1100 Subject: perf_counter: Add support for pinned and exclusive counter groups Impact: New perf_counter features A pinned counter group is one that the user wants to have on the CPU whenever possible, i.e. whenever the associated task is running, for a per-task group, or always for a per-cpu group. If the system cannot satisfy that, it puts the group into an error state where it is not scheduled any more and reads from it return EOF (i.e. 0 bytes read). The group can be released from error state and made readable again using prctl(PR_TASK_PERF_COUNTERS_ENABLE). When we have finer-grained enable/disable controls on counters we'll be able to reset the error state on individual groups. An exclusive group is one that the user wants to be the only group using the CPU performance monitor hardware whenever it is on. The counter group scheduler will not schedule an exclusive group if there are already other groups on the CPU and will not schedule other groups onto the CPU if there is an exclusive group scheduled (that statement does not apply to groups containing only software counters, which can always go on and which do not prevent an exclusive group from going on). With an exclusive group, we will be able to let users program PMU registers at a low level without the concern that those settings will perturb other measurements. Along the way this reorganizes things a little: - is_software_counter() is moved to perf_counter.h. - cpuctx->active_oncpu now records the number of hardware counters on the CPU, i.e. it now excludes software counters. Nothing was reading cpuctx->active_oncpu before, so this change is harmless. - A new cpuctx->exclusive field records whether we currently have an exclusive group on the CPU. - counter_sched_out moves higher up in perf_counter.c and gets called from __perf_counter_remove_from_context and __perf_counter_exit_task, where we used to have essentially the same code. - __perf_counter_sched_in now goes through the counter list twice, doing the pinned counters in the first loop and the non-pinned counters in the second loop, in order to give the pinned counters the best chance to be scheduled in. Note that only a group leader can be exclusive or pinned, and that attribute applies to the whole group. This avoids some awkwardness in some corner cases (e.g. where a group leader is closed and the other group members get added to the context list). If we want to relax that restriction later, we can, and it is easier to relax a restriction than to apply a new one. This doesn't yet handle the case where a pinned counter is inherited and goes into error state in the child - the error state is not propagated up to the parent when the child exits, and arguably it should. Signed-off-by: Paul Mackerras --- kernel/perf_counter.c | 226 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 154 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 52f2f526248..faf671b2956 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -93,6 +93,25 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) } } +static void +counter_sched_out(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + return; + + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->hw_ops->disable(counter); + counter->oncpu = -1; + + if (!is_software_counter(counter)) + cpuctx->active_oncpu--; + ctx->nr_active--; + if (counter->hw_event.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} + /* * Cross CPU call to remove a performance counter * @@ -118,14 +137,9 @@ static void __perf_counter_remove_from_context(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->hw_ops->disable(counter); - ctx->nr_active--; - cpuctx->active_oncpu--; - counter->task = NULL; - counter->oncpu = -1; - } + counter_sched_out(counter, cpuctx, ctx); + + counter->task = NULL; ctx->nr_counters--; /* @@ -207,7 +221,7 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { - if (counter->state == PERF_COUNTER_STATE_OFF) + if (counter->state <= PERF_COUNTER_STATE_OFF) return 0; counter->state = PERF_COUNTER_STATE_ACTIVE; @@ -223,12 +237,63 @@ counter_sched_in(struct perf_counter *counter, return -EAGAIN; } - cpuctx->active_oncpu++; + if (!is_software_counter(counter)) + cpuctx->active_oncpu++; ctx->nr_active++; + if (counter->hw_event.exclusive) + cpuctx->exclusive = 1; + return 0; } +/* + * Return 1 for a group consisting entirely of software counters, + * 0 if the group contains any hardware counters. + */ +static int is_software_only_group(struct perf_counter *leader) +{ + struct perf_counter *counter; + + if (!is_software_counter(leader)) + return 0; + list_for_each_entry(counter, &leader->sibling_list, list_entry) + if (!is_software_counter(counter)) + return 0; + return 1; +} + +/* + * Work out whether we can put this counter group on the CPU now. + */ +static int group_can_go_on(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + int can_add_hw) +{ + /* + * Groups consisting entirely of software counters can always go on. + */ + if (is_software_only_group(counter)) + return 1; + /* + * If an exclusive group is already on, no other hardware + * counters can go on. + */ + if (cpuctx->exclusive) + return 0; + /* + * If this group is exclusive and there are already + * counters on the CPU, it can't go on. + */ + if (counter->hw_event.exclusive && cpuctx->active_oncpu) + return 0; + /* + * Otherwise, try to add it if all previous groups were able + * to go on. + */ + return can_add_hw; +} + /* * Cross CPU call to install and enable a performance counter */ @@ -240,6 +305,7 @@ static void __perf_install_in_context(void *info) int cpu = smp_processor_id(); unsigned long flags; u64 perf_flags; + int err; /* * If this is a task context, we need to check whether it is @@ -261,9 +327,21 @@ static void __perf_install_in_context(void *info) list_add_counter(counter, ctx); ctx->nr_counters++; - counter_sched_in(counter, cpuctx, ctx, cpu); + /* + * An exclusive counter can't go on if there are already active + * hardware counters, and no hardware counter can go on if there + * is already an exclusive counter on. + */ + if (counter->state == PERF_COUNTER_STATE_INACTIVE && + !group_can_go_on(counter, cpuctx, 1)) + err = -EEXIST; + else + err = counter_sched_in(counter, cpuctx, ctx, cpu); + + if (err && counter->hw_event.pinned) + counter->state = PERF_COUNTER_STATE_ERROR; - if (!ctx->task && cpuctx->max_pertask) + if (!err && !ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; hw_perf_restore(perf_flags); @@ -326,22 +404,6 @@ retry: spin_unlock_irq(&ctx->lock); } -static void -counter_sched_out(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) -{ - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - return; - - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->hw_ops->disable(counter); - counter->oncpu = -1; - - cpuctx->active_oncpu--; - ctx->nr_active--; -} - static void group_sched_out(struct perf_counter *group_counter, struct perf_cpu_context *cpuctx, @@ -359,6 +421,9 @@ group_sched_out(struct perf_counter *group_counter, */ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) counter_sched_out(counter, cpuctx, ctx); + + if (group_counter->hw_event.exclusive) + cpuctx->exclusive = 0; } void __perf_counter_sched_out(struct perf_counter_context *ctx, @@ -455,30 +520,6 @@ group_error: return -EAGAIN; } -/* - * Return 1 for a software counter, 0 for a hardware counter - */ -static inline int is_software_counter(struct perf_counter *counter) -{ - return !counter->hw_event.raw && counter->hw_event.type < 0; -} - -/* - * Return 1 for a group consisting entirely of software counters, - * 0 if the group contains any hardware counters. - */ -static int is_software_only_group(struct perf_counter *leader) -{ - struct perf_counter *counter; - - if (!is_software_counter(leader)) - return 0; - list_for_each_entry(counter, &leader->sibling_list, list_entry) - if (!is_software_counter(counter)) - return 0; - return 1; -} - static void __perf_counter_sched_in(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx, int cpu) @@ -492,7 +533,38 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, spin_lock(&ctx->lock); flags = hw_perf_save_disable(); + + /* + * First go through the list and put on any pinned groups + * in order to give them the best chance of going on. + */ + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->state <= PERF_COUNTER_STATE_OFF || + !counter->hw_event.pinned) + continue; + if (counter->cpu != -1 && counter->cpu != cpu) + continue; + + if (group_can_go_on(counter, cpuctx, 1)) + group_sched_in(counter, cpuctx, ctx, cpu); + + /* + * If this pinned group hasn't been scheduled, + * put it in error state. + */ + if (counter->state == PERF_COUNTER_STATE_INACTIVE) + counter->state = PERF_COUNTER_STATE_ERROR; + } + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + /* + * Ignore counters in OFF or ERROR state, and + * ignore pinned counters since we did them already. + */ + if (counter->state <= PERF_COUNTER_STATE_OFF || + counter->hw_event.pinned) + continue; + /* * Listen to the 'cpu' scheduling filter constraint * of counters: @@ -500,14 +572,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, if (counter->cpu != -1 && counter->cpu != cpu) continue; - /* - * If we scheduled in a group atomically and exclusively, - * or if this group can't go on, don't add any more - * hardware counters. - */ - if (can_add_hw || is_software_only_group(counter)) + if (group_can_go_on(counter, cpuctx, can_add_hw)) { if (group_sched_in(counter, cpuctx, ctx, cpu)) can_add_hw = 0; + } } hw_perf_restore(flags); spin_unlock(&ctx->lock); @@ -567,8 +635,10 @@ int perf_counter_task_disable(void) */ perf_flags = hw_perf_save_disable(); - list_for_each_entry(counter, &ctx->counter_list, list_entry) - counter->state = PERF_COUNTER_STATE_OFF; + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->state != PERF_COUNTER_STATE_ERROR) + counter->state = PERF_COUNTER_STATE_OFF; + } hw_perf_restore(perf_flags); @@ -607,7 +677,7 @@ int perf_counter_task_enable(void) perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_OFF) + if (counter->state > PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; counter->hw_event.disabled = 0; @@ -849,6 +919,14 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (count != sizeof(cntval)) return -EINVAL; + /* + * Return end-of-file for a read on a counter that is in + * error state (i.e. because it was pinned but it couldn't be + * scheduled on to the CPU at some point). + */ + if (counter->state == PERF_COUNTER_STATE_ERROR) + return 0; + mutex_lock(&counter->mutex); cntval = perf_counter_read(counter); mutex_unlock(&counter->mutex); @@ -884,7 +962,7 @@ perf_read_irq_data(struct perf_counter *counter, { struct perf_data *irqdata, *usrdata; DECLARE_WAITQUEUE(wait, current); - ssize_t res; + ssize_t res, res2; irqdata = counter->irqdata; usrdata = counter->usrdata; @@ -905,6 +983,9 @@ perf_read_irq_data(struct perf_counter *counter, if (signal_pending(current)) break; + if (counter->state == PERF_COUNTER_STATE_ERROR) + break; + spin_unlock_irq(&counter->waitq.lock); schedule(); spin_lock_irq(&counter->waitq.lock); @@ -913,7 +994,8 @@ perf_read_irq_data(struct perf_counter *counter, __set_current_state(TASK_RUNNING); spin_unlock_irq(&counter->waitq.lock); - if (usrdata->len + irqdata->len < count) + if (usrdata->len + irqdata->len < count && + counter->state != PERF_COUNTER_STATE_ERROR) return -ERESTARTSYS; read_pending: mutex_lock(&counter->mutex); @@ -925,11 +1007,12 @@ read_pending: /* Switch irq buffer: */ usrdata = perf_switch_irq_data(counter); - if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) { + res2 = perf_copy_usrdata(usrdata, buf + res, count - res); + if (res2 < 0) { if (!res) res = -EFAULT; } else { - res = count; + res += res2; } out: mutex_unlock(&counter->mutex); @@ -1348,6 +1431,11 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, */ if (group_leader->ctx != ctx) goto err_put_context; + /* + * Only a group leader can be exclusive or pinned + */ + if (hw_event.exclusive || hw_event.pinned) + goto err_put_context; } ret = -EINVAL; @@ -1473,13 +1561,7 @@ __perf_counter_exit_task(struct task_struct *child, cpuctx = &__get_cpu_var(perf_cpu_context); - if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) { - child_counter->state = PERF_COUNTER_STATE_INACTIVE; - child_counter->hw_ops->disable(child_counter); - cpuctx->active_oncpu--; - child_ctx->nr_active--; - child_counter->oncpu = -1; - } + counter_sched_out(child_counter, cpuctx, child_ctx); list_del_init(&child_counter->list_entry); -- cgit v1.2.3 From d859e29fe34cb833071b20aef860ee94fbad9bb2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 17 Jan 2009 18:10:22 +1100 Subject: perf_counter: Add counter enable/disable ioctls Impact: New perf_counter features This primarily adds a way for perf_counter users to enable and disable counters and groups. Enabling or disabling a counter or group also enables or disables all of the child counters that have been cloned from it to monitor children of the task monitored by the top-level counter. The userspace interface to enable/disable counters is via ioctl on the counter file descriptor. Along the way this extends the code that handles child counters to handle child counter groups properly. A group with multiple counters will be cloned to child tasks if and only if the group leader has the hw_event.inherit bit set - if it is set the whole group is cloned as a group in the child task. In order to be able to enable or disable all child counters of a given top-level counter, we need a way to find them all. Hence I have added a child_list field to struct perf_counter, which is the head of the list of children for a top-level counter, or the link in that list for a child counter. That list is protected by the perf_counter.mutex field. This also adds a mutex to the perf_counter_context struct. Previously the list of counters was protected just by the lock field in the context, which meant that perf_counter_init_task had to take that lock and then take whatever lock/mutex protects the top-level counter's child_list. But the counter enable/disable functions need to take that lock in order to traverse the list, then for each counter take the lock in that counter's context in order to change the counter's state safely, which will lead to a deadlock. To solve this, we now have both a mutex and a spinlock in the context, and taking either is sufficient to ensure the list of counters can't change - you have to take both before changing the list. Now perf_counter_init_task takes the mutex instead of the lock (which incidentally means that inherit_counter can use GFP_KERNEL instead of GFP_ATOMIC) and thus avoids the possible deadlock. Similarly the new enable/disable functions can take the mutex while traversing the list of child counters without incurring a possible deadlock when the counter manipulation code locks the context for a child counter. We also had an misfeature that the first counter added to a context would possibly not go on until the next sched-in, because we were using ctx->nr_active to detect if the context was running on a CPU. But nr_active is the number of active counters, and if that was zero (because the context didn't have any counters yet) it would look like the context wasn't running on a cpu and so the retry code in __perf_install_in_context wouldn't retry. So this adds an 'is_active' field that is set when the context is on a CPU, even if it has no counters. The is_active field is only used for task contexts, not for per-cpu contexts. If we enable a subsidiary counter in a group that is active on a CPU, and the arch code can't enable the counter, then we have to pull the whole group off the CPU. We do this with group_sched_out, which gets moved up in the file so it comes before all its callers. This also adds similar logic to __perf_install_in_context so that the "all on, or none" invariant of groups is preserved when adding a new counter to a group. Signed-off-by: Paul Mackerras --- kernel/perf_counter.c | 455 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 396 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index faf671b2956..1ac18daa424 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -112,6 +112,28 @@ counter_sched_out(struct perf_counter *counter, cpuctx->exclusive = 0; } +static void +group_sched_out(struct perf_counter *group_counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + + if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) + return; + + counter_sched_out(group_counter, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) + counter_sched_out(counter, cpuctx, ctx); + + if (group_counter->hw_event.exclusive) + cpuctx->exclusive = 0; +} + /* * Cross CPU call to remove a performance counter * @@ -168,7 +190,7 @@ static void __perf_counter_remove_from_context(void *info) /* * Remove the counter from a task's (or a CPU's) list of counters. * - * Must be called with counter->mutex held. + * Must be called with counter->mutex and ctx->mutex held. * * CPU counters are removed with a smp call. For task counters we only * call when the task is on a CPU. @@ -215,6 +237,99 @@ retry: spin_unlock_irq(&ctx->lock); } +/* + * Cross CPU call to disable a performance counter + */ +static void __perf_counter_disable(void *info) +{ + struct perf_counter *counter = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter_context *ctx = counter->ctx; + unsigned long flags; + + /* + * If this is a per-task counter, need to check whether this + * counter's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); + + /* + * If the counter is on, turn it off. + * If it is in error state, leave it in error state. + */ + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { + if (counter == counter->group_leader) + group_sched_out(counter, cpuctx, ctx); + else + counter_sched_out(counter, cpuctx, ctx); + counter->state = PERF_COUNTER_STATE_OFF; + } + + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); +} + +/* + * Disable a counter. + */ +static void perf_counter_disable(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Disable the counter on the cpu that it's on + */ + smp_call_function_single(counter->cpu, __perf_counter_disable, + counter, 1); + return; + } + + retry: + task_oncpu_function_call(task, __perf_counter_disable, counter); + + spin_lock_irq(&ctx->lock); + /* + * If the counter is still active, we need to retry the cross-call. + */ + if (counter->state == PERF_COUNTER_STATE_ACTIVE) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (counter->state == PERF_COUNTER_STATE_INACTIVE) + counter->state = PERF_COUNTER_STATE_OFF; + + spin_unlock_irq(&ctx->lock); +} + +/* + * Disable a counter and all its children. + */ +static void perf_counter_disable_family(struct perf_counter *counter) +{ + struct perf_counter *child; + + perf_counter_disable(counter); + + /* + * Lock the mutex to protect the list of children + */ + mutex_lock(&counter->mutex); + list_for_each_entry(child, &counter->child_list, child_list) + perf_counter_disable(child); + mutex_unlock(&counter->mutex); +} + static int counter_sched_in(struct perf_counter *counter, struct perf_cpu_context *cpuctx, @@ -302,6 +417,7 @@ static void __perf_install_in_context(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; + struct perf_counter *leader = counter->group_leader; int cpu = smp_processor_id(); unsigned long flags; u64 perf_flags; @@ -327,23 +443,40 @@ static void __perf_install_in_context(void *info) list_add_counter(counter, ctx); ctx->nr_counters++; + /* + * Don't put the counter on if it is disabled or if + * it is in a group and the group isn't on. + */ + if (counter->state != PERF_COUNTER_STATE_INACTIVE || + (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)) + goto unlock; + /* * An exclusive counter can't go on if there are already active * hardware counters, and no hardware counter can go on if there * is already an exclusive counter on. */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE && - !group_can_go_on(counter, cpuctx, 1)) + if (!group_can_go_on(counter, cpuctx, 1)) err = -EEXIST; else err = counter_sched_in(counter, cpuctx, ctx, cpu); - if (err && counter->hw_event.pinned) - counter->state = PERF_COUNTER_STATE_ERROR; + if (err) { + /* + * This counter couldn't go on. If it is in a group + * then we have to pull the whole group off. + * If the counter group is pinned then put it in error state. + */ + if (leader != counter) + group_sched_out(leader, cpuctx, ctx); + if (leader->hw_event.pinned) + leader->state = PERF_COUNTER_STATE_ERROR; + } if (!err && !ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; + unlock: hw_perf_restore(perf_flags); spin_unlock(&ctx->lock); @@ -359,6 +492,8 @@ static void __perf_install_in_context(void *info) * If the counter is attached to a task which is on a CPU we use a smp * call to enable it in the task context. The task might have been * scheduled away, but we check this in the smp call again. + * + * Must be called with ctx->mutex held. */ static void perf_install_in_context(struct perf_counter_context *ctx, @@ -387,7 +522,7 @@ retry: /* * we need to retry the smp call. */ - if (ctx->nr_active && list_empty(&counter->list_entry)) { + if (ctx->is_active && list_empty(&counter->list_entry)) { spin_unlock_irq(&ctx->lock); goto retry; } @@ -404,26 +539,131 @@ retry: spin_unlock_irq(&ctx->lock); } -static void -group_sched_out(struct perf_counter *group_counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) +/* + * Cross CPU call to enable a performance counter + */ +static void __perf_counter_enable(void *info) { - struct perf_counter *counter; + struct perf_counter *counter = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter_context *ctx = counter->ctx; + struct perf_counter *leader = counter->group_leader; + unsigned long flags; + int err; - if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) + /* + * If this is a per-task counter, need to check whether this + * counter's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) return; - counter_sched_out(group_counter, cpuctx, ctx); + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); + + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + goto unlock; + counter->state = PERF_COUNTER_STATE_INACTIVE; /* - * Schedule out siblings (if any): + * If the counter is in a group and isn't the group leader, + * then don't put it on unless the group is on. */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) - counter_sched_out(counter, cpuctx, ctx); + if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) + goto unlock; - if (group_counter->hw_event.exclusive) - cpuctx->exclusive = 0; + if (!group_can_go_on(counter, cpuctx, 1)) + err = -EEXIST; + else + err = counter_sched_in(counter, cpuctx, ctx, + smp_processor_id()); + + if (err) { + /* + * If this counter can't go on and it's part of a + * group, then the whole group has to come off. + */ + if (leader != counter) + group_sched_out(leader, cpuctx, ctx); + if (leader->hw_event.pinned) + leader->state = PERF_COUNTER_STATE_ERROR; + } + + unlock: + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); +} + +/* + * Enable a counter. + */ +static void perf_counter_enable(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Enable the counter on the cpu that it's on + */ + smp_call_function_single(counter->cpu, __perf_counter_enable, + counter, 1); + return; + } + + spin_lock_irq(&ctx->lock); + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + goto out; + + /* + * If the counter is in error state, clear that first. + * That way, if we see the counter in error state below, we + * know that it has gone back into error state, as distinct + * from the task having been scheduled away before the + * cross-call arrived. + */ + if (counter->state == PERF_COUNTER_STATE_ERROR) + counter->state = PERF_COUNTER_STATE_OFF; + + retry: + spin_unlock_irq(&ctx->lock); + task_oncpu_function_call(task, __perf_counter_enable, counter); + + spin_lock_irq(&ctx->lock); + + /* + * If the context is active and the counter is still off, + * we need to retry the cross-call. + */ + if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF) + goto retry; + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (counter->state == PERF_COUNTER_STATE_OFF) + counter->state = PERF_COUNTER_STATE_INACTIVE; + out: + spin_unlock_irq(&ctx->lock); +} + +/* + * Enable a counter and all its children. + */ +static void perf_counter_enable_family(struct perf_counter *counter) +{ + struct perf_counter *child; + + perf_counter_enable(counter); + + /* + * Lock the mutex to protect the list of children + */ + mutex_lock(&counter->mutex); + list_for_each_entry(child, &counter->child_list, child_list) + perf_counter_enable(child); + mutex_unlock(&counter->mutex); } void __perf_counter_sched_out(struct perf_counter_context *ctx, @@ -432,16 +672,18 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, struct perf_counter *counter; u64 flags; + spin_lock(&ctx->lock); + ctx->is_active = 0; if (likely(!ctx->nr_counters)) - return; + goto out; - spin_lock(&ctx->lock); flags = hw_perf_save_disable(); if (ctx->nr_active) { list_for_each_entry(counter, &ctx->counter_list, list_entry) group_sched_out(counter, cpuctx, ctx); } hw_perf_restore(flags); + out: spin_unlock(&ctx->lock); } @@ -528,10 +770,11 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, u64 flags; int can_add_hw = 1; + spin_lock(&ctx->lock); + ctx->is_active = 1; if (likely(!ctx->nr_counters)) - return; + goto out; - spin_lock(&ctx->lock); flags = hw_perf_save_disable(); /* @@ -578,6 +821,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, } } hw_perf_restore(flags); + out: spin_unlock(&ctx->lock); } @@ -896,12 +1140,14 @@ static int perf_release(struct inode *inode, struct file *file) file->private_data = NULL; + mutex_lock(&ctx->mutex); mutex_lock(&counter->mutex); perf_counter_remove_from_context(counter); put_context(ctx); mutex_unlock(&counter->mutex); + mutex_unlock(&ctx->mutex); kfree(counter); @@ -1053,10 +1299,30 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) return events; } +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_counter *counter = file->private_data; + int err = 0; + + switch (cmd) { + case PERF_COUNTER_IOC_ENABLE: + perf_counter_enable_family(counter); + break; + case PERF_COUNTER_IOC_DISABLE: + perf_counter_disable_family(counter); + break; + default: + err = -ENOTTY; + } + return err; +} + static const struct file_operations perf_fops = { .release = perf_release, .read = perf_read, .poll = perf_poll, + .unlocked_ioctl = perf_ioctl, + .compat_ioctl = perf_ioctl, }; static int cpu_clock_perf_counter_enable(struct perf_counter *counter) @@ -1348,6 +1614,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, INIT_LIST_HEAD(&counter->sibling_list); init_waitqueue_head(&counter->waitq); + INIT_LIST_HEAD(&counter->child_list); + counter->irqdata = &counter->data[0]; counter->usrdata = &counter->data[1]; counter->cpu = cpu; @@ -1452,7 +1720,9 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, goto err_free_put_context; counter->filp = counter_file; + mutex_lock(&ctx->mutex); perf_install_in_context(ctx, counter, cpu); + mutex_unlock(&ctx->mutex); fput_light(counter_file, fput_needed2); @@ -1479,6 +1749,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx, { memset(ctx, 0, sizeof(*ctx)); spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->counter_list); ctx->task = task; } @@ -1486,20 +1757,30 @@ __perf_counter_init_context(struct perf_counter_context *ctx, /* * inherit a counter from parent task to child task: */ -static int +static struct perf_counter * inherit_counter(struct perf_counter *parent_counter, struct task_struct *parent, struct perf_counter_context *parent_ctx, struct task_struct *child, + struct perf_counter *group_leader, struct perf_counter_context *child_ctx) { struct perf_counter *child_counter; + /* + * Instead of creating recursive hierarchies of counters, + * we link inherited counters back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + if (parent_counter->parent) + parent_counter = parent_counter->parent; + child_counter = perf_counter_alloc(&parent_counter->hw_event, - parent_counter->cpu, NULL, - GFP_ATOMIC); + parent_counter->cpu, group_leader, + GFP_KERNEL); if (!child_counter) - return -ENOMEM; + return NULL; /* * Link it up in the child's context: @@ -1523,16 +1804,82 @@ inherit_counter(struct perf_counter *parent_counter, */ atomic_long_inc(&parent_counter->filp->f_count); + /* + * Link this into the parent counter's child list + */ + mutex_lock(&parent_counter->mutex); + list_add_tail(&child_counter->child_list, &parent_counter->child_list); + + /* + * Make the child state follow the state of the parent counter, + * not its hw_event.disabled bit. We hold the parent's mutex, + * so we won't race with perf_counter_{en,dis}able_family. + */ + if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) + child_counter->state = PERF_COUNTER_STATE_INACTIVE; + else + child_counter->state = PERF_COUNTER_STATE_OFF; + + mutex_unlock(&parent_counter->mutex); + + return child_counter; +} + +static int inherit_group(struct perf_counter *parent_counter, + struct task_struct *parent, + struct perf_counter_context *parent_ctx, + struct task_struct *child, + struct perf_counter_context *child_ctx) +{ + struct perf_counter *leader; + struct perf_counter *sub; + + leader = inherit_counter(parent_counter, parent, parent_ctx, + child, NULL, child_ctx); + if (!leader) + return -ENOMEM; + list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) { + if (!inherit_counter(sub, parent, parent_ctx, + child, leader, child_ctx)) + return -ENOMEM; + } return 0; } +static void sync_child_counter(struct perf_counter *child_counter, + struct perf_counter *parent_counter) +{ + u64 parent_val, child_val; + + parent_val = atomic64_read(&parent_counter->count); + child_val = atomic64_read(&child_counter->count); + + /* + * Add back the child's count to the parent's count: + */ + atomic64_add(child_val, &parent_counter->count); + + /* + * Remove this counter from the parent's list + */ + mutex_lock(&parent_counter->mutex); + list_del_init(&child_counter->child_list); + mutex_unlock(&parent_counter->mutex); + + /* + * Release the parent counter, if this was the last + * reference to it. + */ + fput(parent_counter->filp); +} + static void __perf_counter_exit_task(struct task_struct *child, struct perf_counter *child_counter, struct perf_counter_context *child_ctx) { struct perf_counter *parent_counter; - u64 parent_val, child_val; + struct perf_counter *sub, *tmp; /* * If we do not self-reap then we have to wait for the @@ -1561,7 +1908,7 @@ __perf_counter_exit_task(struct task_struct *child, cpuctx = &__get_cpu_var(perf_cpu_context); - counter_sched_out(child_counter, cpuctx, child_ctx); + group_sched_out(child_counter, cpuctx, child_ctx); list_del_init(&child_counter->list_entry); @@ -1577,26 +1924,23 @@ __perf_counter_exit_task(struct task_struct *child, * that are still around due to the child reference. These * counters need to be zapped - but otherwise linger. */ - if (!parent_counter) - return; - - parent_val = atomic64_read(&parent_counter->count); - child_val = atomic64_read(&child_counter->count); - - /* - * Add back the child's count to the parent's count: - */ - atomic64_add(child_val, &parent_counter->count); - - fput(parent_counter->filp); + if (parent_counter) { + sync_child_counter(child_counter, parent_counter); + list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list, + list_entry) { + if (sub->parent) + sync_child_counter(sub, sub->parent); + kfree(sub); + } + } kfree(child_counter); } /* - * When a child task exist, feed back counter values to parent counters. + * When a child task exits, feed back counter values to parent counters. * - * Note: we are running in child context, but the PID is not hashed + * Note: we may be running in child context, but the PID is not hashed * anymore so new counters will not be added. */ void perf_counter_exit_task(struct task_struct *child) @@ -1620,9 +1964,8 @@ void perf_counter_exit_task(struct task_struct *child) void perf_counter_init_task(struct task_struct *child) { struct perf_counter_context *child_ctx, *parent_ctx; - struct perf_counter *counter, *parent_counter; + struct perf_counter *counter; struct task_struct *parent = current; - unsigned long flags; child_ctx = &child->perf_counter_ctx; parent_ctx = &parent->perf_counter_ctx; @@ -1641,32 +1984,22 @@ void perf_counter_init_task(struct task_struct *child) * Lock the parent list. No need to lock the child - not PID * hashed yet and not running, so nobody can access it. */ - spin_lock_irqsave(&parent_ctx->lock, flags); + mutex_lock(&parent_ctx->mutex); /* * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) { - if (!counter->hw_event.inherit || counter->group_leader != counter) + if (!counter->hw_event.inherit) continue; - /* - * Instead of creating recursive hierarchies of counters, - * we link inheritd counters back to the original parent, - * which has a filp for sure, which we use as the reference - * count: - */ - parent_counter = counter; - if (counter->parent) - parent_counter = counter->parent; - - if (inherit_counter(parent_counter, parent, + if (inherit_group(counter, parent, parent_ctx, child, child_ctx)) break; } - spin_unlock_irqrestore(&parent_ctx->lock, flags); + mutex_unlock(&parent_ctx->mutex); } static void __cpuinit perf_counter_init_cpu(int cpu) @@ -1692,11 +2025,15 @@ static void __perf_counter_exit_cpu(void *info) list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) __perf_counter_remove_from_context(counter); - } static void perf_counter_exit_cpu(int cpu) { + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx = &cpuctx->ctx; + + mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); + mutex_unlock(&ctx->mutex); } #else static inline void perf_counter_exit_cpu(int cpu) { } -- cgit v1.2.3 From 65d370862f64973611a271ced61864b5f9bb6fc0 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 29 Jan 2009 14:06:52 +0100 Subject: perfcounters: fix refcounting bug don't kfree in use counters. Running... while true; do perfstat -e 1 -c true; done ...on all cores for a while doesn't seem to be eating ram, and my oops is gone. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1ac18daa424..f27a7e9f3c4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1934,7 +1934,8 @@ __perf_counter_exit_task(struct task_struct *child, } } - kfree(child_counter); + if (!child_counter->filp || !atomic_long_read(&child_counter->filp->f_count)) + kfree(child_counter); } /* -- cgit v1.2.3 From 23a185ca8abbeef64b6ffc33059b1d630e43ec10 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 9 Feb 2009 22:42:47 +1100 Subject: perf_counters: make software counters work as per-cpu counters Impact: kernel crash fix Yanmin Zhang reported that using a PERF_COUNT_TASK_CLOCK software counter as a per-cpu counter would reliably crash the system, because it calls __task_delta_exec with a null pointer. The page fault, context switch and cpu migration counters also won't function correctly as per-cpu counters since they reference the current task. This fixes the problem by redirecting the task_clock counter to the cpu_clock counter when used as a per-cpu counter, and by implementing per-cpu page fault, context switch and cpu migration counters. Along the way, this: - Initializes counter->ctx earlier, in perf_counter_alloc, so that sw_perf_counter_init can use it - Adds code to kernel/sched.c to count task migrations into each cpu, in rq->nr_migrations_in - Exports the per-cpu context switch and task migration counts via new functions added to kernel/sched.c - Makes sure that if sw_perf_counter_init fails, we don't try to initialize the counter as a hardware counter. Since the user has passed a negative, non-raw event type, they clearly don't intend for it to be interpreted as a hardware event. Reported-by: "Zhang Yanmin" Signed-off-by: Paul Mackerras Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 78 +++++++++++++++++++++++++++++---------------------- kernel/sched.c | 17 +++++++++++ 2 files changed, 62 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f27a7e9f3c4..544193cbc47 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include /* * Each CPU has a list of per CPU counters: @@ -502,7 +504,6 @@ perf_install_in_context(struct perf_counter_context *ctx, { struct task_struct *task = ctx->task; - counter->ctx = ctx; if (!task) { /* * Per cpu counters are installed via an smp call and @@ -1417,11 +1418,19 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; -static u64 get_page_faults(void) +#ifdef CONFIG_VM_EVENT_COUNTERS +#define cpu_page_faults() __get_cpu_var(vm_event_states).event[PGFAULT] +#else +#define cpu_page_faults() 0 +#endif + +static u64 get_page_faults(struct perf_counter *counter) { - struct task_struct *curr = current; + struct task_struct *curr = counter->ctx->task; - return curr->maj_flt + curr->min_flt; + if (curr) + return curr->maj_flt + curr->min_flt; + return cpu_page_faults(); } static void page_faults_perf_counter_update(struct perf_counter *counter) @@ -1430,7 +1439,7 @@ static void page_faults_perf_counter_update(struct perf_counter *counter) s64 delta; prev = atomic64_read(&counter->hw.prev_count); - now = get_page_faults(); + now = get_page_faults(counter); atomic64_set(&counter->hw.prev_count, now); @@ -1446,11 +1455,7 @@ static void page_faults_perf_counter_read(struct perf_counter *counter) static int page_faults_perf_counter_enable(struct perf_counter *counter) { - /* - * page-faults is a per-task value already, - * so we dont have to clear it on switch-in. - */ - + atomic64_set(&counter->hw.prev_count, get_page_faults(counter)); return 0; } @@ -1465,11 +1470,13 @@ static const struct hw_perf_counter_ops perf_ops_page_faults = { .read = page_faults_perf_counter_read, }; -static u64 get_context_switches(void) +static u64 get_context_switches(struct perf_counter *counter) { - struct task_struct *curr = current; + struct task_struct *curr = counter->ctx->task; - return curr->nvcsw + curr->nivcsw; + if (curr) + return curr->nvcsw + curr->nivcsw; + return cpu_nr_switches(smp_processor_id()); } static void context_switches_perf_counter_update(struct perf_counter *counter) @@ -1478,7 +1485,7 @@ static void context_switches_perf_counter_update(struct perf_counter *counter) s64 delta; prev = atomic64_read(&counter->hw.prev_count); - now = get_context_switches(); + now = get_context_switches(counter); atomic64_set(&counter->hw.prev_count, now); @@ -1494,11 +1501,7 @@ static void context_switches_perf_counter_read(struct perf_counter *counter) static int context_switches_perf_counter_enable(struct perf_counter *counter) { - /* - * ->nvcsw + curr->nivcsw is a per-task value already, - * so we dont have to clear it on switch-in. - */ - + atomic64_set(&counter->hw.prev_count, get_context_switches(counter)); return 0; } @@ -1513,9 +1516,13 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = { .read = context_switches_perf_counter_read, }; -static inline u64 get_cpu_migrations(void) +static inline u64 get_cpu_migrations(struct perf_counter *counter) { - return current->se.nr_migrations; + struct task_struct *curr = counter->ctx->task; + + if (curr) + return curr->se.nr_migrations; + return cpu_nr_migrations(smp_processor_id()); } static void cpu_migrations_perf_counter_update(struct perf_counter *counter) @@ -1524,7 +1531,7 @@ static void cpu_migrations_perf_counter_update(struct perf_counter *counter) s64 delta; prev = atomic64_read(&counter->hw.prev_count); - now = get_cpu_migrations(); + now = get_cpu_migrations(counter); atomic64_set(&counter->hw.prev_count, now); @@ -1540,11 +1547,7 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter) static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) { - /* - * se.nr_migrations is a per-task value already, - * so we dont have to clear it on switch-in. - */ - + atomic64_set(&counter->hw.prev_count, get_cpu_migrations(counter)); return 0; } @@ -1569,7 +1572,14 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_TASK_CLOCK: - hw_ops = &perf_ops_task_clock; + /* + * If the user instantiates this as a per-cpu counter, + * use the cpu_clock counter instead. + */ + if (counter->ctx->task) + hw_ops = &perf_ops_task_clock; + else + hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_PAGE_FAULTS: hw_ops = &perf_ops_page_faults; @@ -1592,6 +1602,7 @@ sw_perf_counter_init(struct perf_counter *counter) static struct perf_counter * perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu, + struct perf_counter_context *ctx, struct perf_counter *group_leader, gfp_t gfpflags) { @@ -1623,6 +1634,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->wakeup_pending = 0; counter->group_leader = group_leader; counter->hw_ops = NULL; + counter->ctx = ctx; counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) @@ -1631,7 +1643,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, hw_ops = NULL; if (!hw_event->raw && hw_event->type < 0) hw_ops = sw_perf_counter_init(counter); - if (!hw_ops) + else hw_ops = hw_perf_counter_init(counter); if (!hw_ops) { @@ -1707,7 +1719,8 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, } ret = -EINVAL; - counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL); + counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader, + GFP_KERNEL); if (!counter) goto err_put_context; @@ -1777,15 +1790,14 @@ inherit_counter(struct perf_counter *parent_counter, parent_counter = parent_counter->parent; child_counter = perf_counter_alloc(&parent_counter->hw_event, - parent_counter->cpu, group_leader, - GFP_KERNEL); + parent_counter->cpu, child_ctx, + group_leader, GFP_KERNEL); if (!child_counter) return NULL; /* * Link it up in the child's context: */ - child_counter->ctx = child_ctx; child_counter->task = child; list_add_counter(child_counter, child_ctx); child_ctx->nr_counters++; diff --git a/kernel/sched.c b/kernel/sched.c index 8db1a4cf208..173768f142a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -558,6 +558,7 @@ struct rq { struct load_weight load; unsigned long nr_load_updates; u64 nr_switches; + u64 nr_migrations_in; struct cfs_rq cfs; struct rt_rq rt; @@ -1908,6 +1909,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #endif if (old_cpu != new_cpu) { p->se.nr_migrations++; + new_rq->nr_migrations_in++; #ifdef CONFIG_SCHEDSTATS if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); @@ -2810,6 +2812,21 @@ unsigned long nr_active(void) return running + uninterruptible; } +/* + * Externally visible per-cpu scheduler statistics: + * cpu_nr_switches(cpu) - number of context switches on that cpu + * cpu_nr_migrations(cpu) - number of migrations into that cpu + */ +u64 cpu_nr_switches(int cpu) +{ + return cpu_rq(cpu)->nr_switches; +} + +u64 cpu_nr_migrations(int cpu) +{ + return cpu_rq(cpu)->nr_migrations_in; +} + /* * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). -- cgit v1.2.3 From 0475f9ea8e2cc030298908949e0d5da9f2fc2cfe Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 11 Feb 2009 14:35:35 +1100 Subject: perf_counters: allow users to count user, kernel and/or hypervisor events Impact: new perf_counter feature This extends the perf_counter_hw_event struct with bits that specify that events in user, kernel and/or hypervisor mode should not be counted (i.e. should be excluded), and adds code to program the PMU mode selection bits accordingly on x86 and powerpc. For software counters, we don't currently have the infrastructure to distinguish which mode an event occurs in, so we currently fail the counter initialization if the setting of the hw_event.exclude_* bits would require us to distinguish. Context switches and CPU migrations are currently considered to occur in kernel mode. On x86, this changes the previous policy that only root can count kernel events. Now non-root users can count kernel events or exclude them. Non-root users still can't use NMI events, though. On x86 we don't appear to have any way to control whether hypervisor events are counted or not, so hw_event.exclude_hv is ignored. On powerpc, the selection of whether to count events in user, kernel and/or hypervisor mode is PMU-wide, not per-counter, so this adds a check that the hw_event.exclude_* settings are the same as other events on the PMU. Counters being added to a group have to have the same settings as the other hardware counters in the group. Counters and groups can only be enabled in hw_perf_group_sched_in or power_perf_enable if they have the same settings as any other counters already on the PMU. If we are not running on a hypervisor, the exclude_hv setting is ignored (by forcing it to 0) since we can't ever get any hypervisor events. Signed-off-by: Paul Mackerras --- kernel/perf_counter.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 544193cbc47..89d5e3fe970 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1567,11 +1567,25 @@ sw_perf_counter_init(struct perf_counter *counter) { const struct hw_perf_counter_ops *hw_ops = NULL; + /* + * Software counters (currently) can't in general distinguish + * between user, kernel and hypervisor events. + * However, context switches and cpu migrations are considered + * to be kernel events, and page faults are never hypervisor + * events. + */ switch (counter->hw_event.type) { case PERF_COUNT_CPU_CLOCK: - hw_ops = &perf_ops_cpu_clock; + if (!(counter->hw_event.exclude_user || + counter->hw_event.exclude_kernel || + counter->hw_event.exclude_hv)) + hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_TASK_CLOCK: + if (counter->hw_event.exclude_user || + counter->hw_event.exclude_kernel || + counter->hw_event.exclude_hv) + break; /* * If the user instantiates this as a per-cpu counter, * use the cpu_clock counter instead. @@ -1582,13 +1596,17 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_PAGE_FAULTS: - hw_ops = &perf_ops_page_faults; + if (!(counter->hw_event.exclude_user || + counter->hw_event.exclude_kernel)) + hw_ops = &perf_ops_page_faults; break; case PERF_COUNT_CONTEXT_SWITCHES: - hw_ops = &perf_ops_context_switches; + if (!counter->hw_event.exclude_kernel) + hw_ops = &perf_ops_context_switches; break; case PERF_COUNT_CPU_MIGRATIONS: - hw_ops = &perf_ops_cpu_migrations; + if (!counter->hw_event.exclude_kernel) + hw_ops = &perf_ops_cpu_migrations; break; default: break; -- cgit v1.2.3 From 5af759176cc767e7426f89764bde4996ebaaf419 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 11 Feb 2009 10:53:37 +0100 Subject: perfcounters: fix use after free in perf_release() running... while true; do foo -d 1 -f 1 -c 100000 & sleep 1 kerneltop -d 1 -f 1 -e 1 -c 25000 -p `pidof foo` done while true; do killall foo; killall kerneltop; sleep 2 done ...in two shells with SLUB_DEBUG enabled produces flood of: BUG task_struct: Poison overwritten. Fix the use-after-free bug in perf_release(). Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 89d5e3fe970..e0576c3fdb5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1145,12 +1145,12 @@ static int perf_release(struct inode *inode, struct file *file) mutex_lock(&counter->mutex); perf_counter_remove_from_context(counter); - put_context(ctx); mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); kfree(counter); + put_context(ctx); return 0; } -- cgit v1.2.3 From 4bcf349a0f90d1e69eb35c6df0fa285c886c1cd6 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 11 Feb 2009 13:53:19 +0100 Subject: perfcounters: fix refcounting bug, take 2 Only free child_counter if it has a parent; if it doesn't, then it has a file pointing to it and we'll free it in perf_release. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e0576c3fdb5..fcefb0a726f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1958,14 +1958,13 @@ __perf_counter_exit_task(struct task_struct *child, sync_child_counter(child_counter, parent_counter); list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list, list_entry) { - if (sub->parent) + if (sub->parent) { sync_child_counter(sub, sub->parent); - kfree(sub); + kfree(sub); + } } - } - - if (!child_counter->filp || !atomic_long_read(&child_counter->filp->f_count)) kfree(child_counter); + } } /* -- cgit v1.2.3 From c07c99b67233ccaad38a961c17405dc1e1542aa4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 13 Feb 2009 22:10:34 +1100 Subject: perfcounters: make context switch and migration software counters work again Jaswinder Singh Rajput reported that commit 23a185ca8abbeef caused the context switch and migration software counters to report zero always. With that commit, the software counters only count events that occur between sched-in and sched-out for a task. This is necessary for the counter enable/disable prctls and ioctls to work. However, the context switch and migration counts are incremented after sched-out for one task and before sched-in for the next. Since the increment doesn't occur while a task is scheduled in (as far as the software counters are concerned) it doesn't count towards any counter. Thus the context switch and migration counters need to count events that occur at any time, provided the counter is enabled, not just those that occur while the task is scheduled in (from the perf_counter subsystem's point of view). The problem though is that the software counter code can't tell the difference between being enabled and being scheduled in, and between being disabled and being scheduled out, since we use the one pair of enable/disable entry points for both. That is, the high-level disable operation simply arranges for the counter to not be scheduled in any more, and the high-level enable operation arranges for it to be scheduled in again. One way to solve this would be to have sched_in/out operations in the hw_perf_counter_ops struct as well as enable/disable. However, this takes a simpler approach: it adds a 'prev_state' field to the perf_counter struct that allows a counter's enable method to know whether the counter was previously disabled or just inactive (scheduled out), and therefore whether the enable method is being called as a result of a high-level enable or a schedule-in operation. This then allows the context switch, migration and page fault counters to reset their hw.prev_count value in their enable functions only if they are called as a result of a high-level enable operation. Although page faults would normally only occur while the counter is scheduled in, this changes the page fault counter code too in case there are ever circumstances where page faults get counted against a task while its counters are not scheduled in. Reported-by: Jaswinder Singh Rajput Signed-off-by: Paul Mackerras Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index fcefb0a726f..ad62965828d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -444,6 +444,7 @@ static void __perf_install_in_context(void *info) list_add_counter(counter, ctx); ctx->nr_counters++; + counter->prev_state = PERF_COUNTER_STATE_OFF; /* * Don't put the counter on if it is disabled or if @@ -562,6 +563,7 @@ static void __perf_counter_enable(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); + counter->prev_state = counter->state; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; @@ -733,6 +735,7 @@ group_sched_in(struct perf_counter *group_counter, if (ret) return ret < 0 ? ret : 0; + group_counter->prev_state = group_counter->state; if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) return -EAGAIN; @@ -740,6 +743,7 @@ group_sched_in(struct perf_counter *group_counter, * Schedule in siblings as one group (if any): */ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { + counter->prev_state = counter->state; if (counter_sched_in(counter, cpuctx, ctx, cpu)) { partial_group = counter; goto group_error; @@ -1398,9 +1402,9 @@ static void task_clock_perf_counter_read(struct perf_counter *counter) static int task_clock_perf_counter_enable(struct perf_counter *counter) { - u64 now = task_clock_perf_counter_val(counter, 0); - - atomic64_set(&counter->hw.prev_count, now); + if (counter->prev_state <= PERF_COUNTER_STATE_OFF) + atomic64_set(&counter->hw.prev_count, + task_clock_perf_counter_val(counter, 0)); return 0; } @@ -1455,7 +1459,8 @@ static void page_faults_perf_counter_read(struct perf_counter *counter) static int page_faults_perf_counter_enable(struct perf_counter *counter) { - atomic64_set(&counter->hw.prev_count, get_page_faults(counter)); + if (counter->prev_state <= PERF_COUNTER_STATE_OFF) + atomic64_set(&counter->hw.prev_count, get_page_faults(counter)); return 0; } @@ -1501,7 +1506,9 @@ static void context_switches_perf_counter_read(struct perf_counter *counter) static int context_switches_perf_counter_enable(struct perf_counter *counter) { - atomic64_set(&counter->hw.prev_count, get_context_switches(counter)); + if (counter->prev_state <= PERF_COUNTER_STATE_OFF) + atomic64_set(&counter->hw.prev_count, + get_context_switches(counter)); return 0; } @@ -1547,7 +1554,9 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter) static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) { - atomic64_set(&counter->hw.prev_count, get_cpu_migrations(counter)); + if (counter->prev_state <= PERF_COUNTER_STATE_OFF) + atomic64_set(&counter->hw.prev_count, + get_cpu_migrations(counter)); return 0; } -- cgit v1.2.3 From f3dfd2656deb81a0addee4f4ceff66b50a387388 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 26 Feb 2009 22:43:46 +1100 Subject: perfcounters: fix a few minor cleanliness issues This fixes three issues noticed by Arnd Bergmann: - Add #ifdef __KERNEL__ and move some things around in perf_counter.h to make sure only the bits that userspace needs are exported to userspace. - Use __u64, __s64, __u32 types in the structs exported to userspace rather than u64, s64, u32. - Make the sys_perf_counter_open syscall available to the SPUs on Cell platforms. And one issue that I noticed in looking at the code again: - Wrap the perf_counter_open syscall with SYSCALL_DEFINE4 so we get the proper handling of int arguments on ppc64 (and some other 64-bit architectures). Reported-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- kernel/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ad62965828d..16b14ba99d3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1690,9 +1690,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, * @cpu: target cpu * @group_fd: group leader counter fd */ -asmlinkage int -sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, int cpu, int group_fd) +SYSCALL_DEFINE4(perf_counter_open, + const struct perf_counter_hw_event __user *, hw_event_uptr, + pid_t, pid, int, cpu, int, group_fd) { struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; -- cgit v1.2.3 From 2743a5b0fa6f309da904f2190a9cc25deee34dbd Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 4 Mar 2009 20:36:51 +1100 Subject: perfcounters: provide expansion room in the ABI Impact: ABI change This expands several fields in the perf_counter_hw_event struct and adds a "flags" argument to the perf_counter_open system call, in order that features can be added in future without ABI changes. In particular the record_type field is expanded to 64 bits, and the space for flag bits has been expanded from 32 to 64 bits. This also adds some new fields: * read_format (64 bits) is intended to provide a way to specify what userspace wants to get back when it does a read() on a simple (non-interrupting) counter; * exclude_idle (1 bit) provides a way for userspace to ask that events that occur when the cpu is idle be excluded; * extra_config_len will provide a way for userspace to supply an arbitrary amount of extra machine-specific PMU configuration data immediately following the perf_counter_hw_event struct, to allow sophisticated users to program things such as instruction matching CAMs and address range registers; * __reserved_3 and __reserved_4 provide space for future expansion. Signed-off-by: Paul Mackerras --- kernel/perf_counter.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 16b14ba99d3..b2e838959f3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1683,16 +1683,16 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, } /** - * sys_perf_task_open - open a performance counter, associate it to a task/cpu + * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * * @hw_event_uptr: event type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu * @group_fd: group leader counter fd */ -SYSCALL_DEFINE4(perf_counter_open, +SYSCALL_DEFINE5(perf_counter_open, const struct perf_counter_hw_event __user *, hw_event_uptr, - pid_t, pid, int, cpu, int, group_fd) + pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; @@ -1703,6 +1703,10 @@ SYSCALL_DEFINE4(perf_counter_open, int fput_needed2 = 0; int ret; + /* for future expandability... */ + if (flags) + return -EINVAL; + if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) return -EFAULT; -- cgit v1.2.3 From ba9372a8f306c4e53a5f61dcbcd6c1e4a8c2e9ac Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:48:52 +0100 Subject: x86, hw-branch-tracer: keep resources on stop Distinguish init/reset and start/stop: init/reset will allocate and release bts tracing resources stop/start will suspend and resume bts tracing Return an error on init() if no cpu can be traced. Signed-off-by: Markus Metzger LKML-Reference: <20090313104852.A30168@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_hw_branches.c | 119 ++++++++++++++++++++++++++++----------- 1 file changed, 85 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 7bfdf4c2347..a99a04c5e9c 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -19,7 +19,7 @@ #include "trace_output.h" -#define SIZEOF_BTS (1 << 13) +#define BTS_BUFFER_SIZE (1 << 13) /* * The tracer lock protects the below per-cpu tracer array. @@ -33,53 +33,68 @@ */ static DEFINE_SPINLOCK(bts_tracer_lock); static DEFINE_PER_CPU(struct bts_tracer *, tracer); -static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); +static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); #define this_tracer per_cpu(tracer, smp_processor_id()) #define this_buffer per_cpu(buffer, smp_processor_id()) -static int __read_mostly trace_hw_branches_enabled; +static int trace_hw_branches_enabled __read_mostly; +static int trace_hw_branches_suspended __read_mostly; static struct trace_array *hw_branch_trace __read_mostly; /* - * Start tracing on the current cpu. + * Initialize the tracer for the current cpu. * The argument is ignored. * * pre: bts_tracer_lock must be locked. */ -static void bts_trace_start_cpu(void *arg) +static void bts_trace_init_cpu(void *arg) { if (this_tracer) ds_release_bts(this_tracer); - this_tracer = - ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, - /* ovfl = */ NULL, /* th = */ (size_t)-1, - BTS_KERNEL); + this_tracer = ds_request_bts(NULL, this_buffer, BTS_BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); if (IS_ERR(this_tracer)) { this_tracer = NULL; return; } } -static void bts_trace_start(struct trace_array *tr) +static int bts_trace_init(struct trace_array *tr) { + int cpu, avail; + spin_lock(&bts_tracer_lock); - on_each_cpu(bts_trace_start_cpu, NULL, 1); - trace_hw_branches_enabled = 1; + hw_branch_trace = tr; + + on_each_cpu(bts_trace_init_cpu, NULL, 1); + + /* Check on how many cpus we could enable tracing */ + avail = 0; + for_each_online_cpu(cpu) + if (per_cpu(tracer, cpu)) + avail++; + + trace_hw_branches_enabled = (avail ? 1 : 0); + trace_hw_branches_suspended = 0; spin_unlock(&bts_tracer_lock); + + + /* If we could not enable tracing on a single cpu, we fail. */ + return avail ? 0 : -EOPNOTSUPP; } /* - * Stop tracing on the current cpu. + * Release the tracer for the current cpu. * The argument is ignored. * * pre: bts_tracer_lock must be locked. */ -static void bts_trace_stop_cpu(void *arg) +static void bts_trace_release_cpu(void *arg) { if (this_tracer) { ds_release_bts(this_tracer); @@ -87,12 +102,57 @@ static void bts_trace_stop_cpu(void *arg) } } -static void bts_trace_stop(struct trace_array *tr) +static void bts_trace_reset(struct trace_array *tr) { spin_lock(&bts_tracer_lock); + on_each_cpu(bts_trace_release_cpu, NULL, 1); trace_hw_branches_enabled = 0; - on_each_cpu(bts_trace_stop_cpu, NULL, 1); + trace_hw_branches_suspended = 0; + + spin_unlock(&bts_tracer_lock); +} + +/* + * Resume tracing on the current cpu. + * The argument is ignored. + * + * pre: bts_tracer_lock must be locked. + */ +static void bts_trace_resume_cpu(void *arg) +{ + if (this_tracer) + ds_resume_bts(this_tracer); +} + +static void bts_trace_start(struct trace_array *tr) +{ + spin_lock(&bts_tracer_lock); + + on_each_cpu(bts_trace_resume_cpu, NULL, 1); + trace_hw_branches_suspended = 0; + + spin_unlock(&bts_tracer_lock); +} + +/* + * Suspend tracing on the current cpu. + * The argument is ignored. + * + * pre: bts_tracer_lock must be locked. + */ +static void bts_trace_suspend_cpu(void *arg) +{ + if (this_tracer) + ds_suspend_bts(this_tracer); +} + +static void bts_trace_stop(struct trace_array *tr) +{ + spin_lock(&bts_tracer_lock); + + on_each_cpu(bts_trace_suspend_cpu, NULL, 1); + trace_hw_branches_suspended = 1; spin_unlock(&bts_tracer_lock); } @@ -110,10 +170,14 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_DOWN_FAILED: - smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); + smp_call_function_single(cpu, bts_trace_init_cpu, NULL, 1); + + if (trace_hw_branches_suspended) + smp_call_function_single(cpu, bts_trace_suspend_cpu, + NULL, 1); break; case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); + smp_call_function_single(cpu, bts_trace_release_cpu, NULL, 1); break; } @@ -126,20 +190,6 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { .notifier_call = bts_hotcpu_handler }; -static int bts_trace_init(struct trace_array *tr) -{ - hw_branch_trace = tr; - - bts_trace_start(tr); - - return 0; -} - -static void bts_trace_reset(struct trace_array *tr) -{ - bts_trace_stop(tr); -} - static void bts_trace_print_header(struct seq_file *m) { seq_puts(m, "# CPU# TO <- FROM\n"); @@ -228,7 +278,7 @@ static void trace_bts_at(const struct bts_trace *trace, void *at) */ static void trace_bts_cpu(void *arg) { - struct trace_array *tr = (struct trace_array *) arg; + struct trace_array *tr = (struct trace_array *)arg; const struct bts_trace *trace; unsigned char *at; @@ -276,7 +326,8 @@ void trace_hw_branch_oops(void) { spin_lock(&bts_tracer_lock); - trace_bts_cpu(hw_branch_trace); + if (trace_hw_branches_enabled) + trace_bts_cpu(hw_branch_trace); spin_unlock(&bts_tracer_lock); } -- cgit v1.2.3 From 321bb5e1ac461c04b6a93f795010d6eb01d8c5ca Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:50:27 +0100 Subject: x86, hw-branch-tracer: add selftest Add a selftest for the hw-branch-tracer. Signed-off-by: Markus Metzger LKML-Reference: <20090313105027.A30183@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 2 ++ kernel/trace/trace_hw_branches.c | 5 +++- kernel/trace/trace_selftest.c | 53 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 56ce34d90b0..e7fbc826f1e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -576,6 +576,8 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); +extern int trace_selftest_startup_hw_branches(struct tracer *trace, + struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index a99a04c5e9c..4ca82700c04 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -342,7 +342,10 @@ struct tracer bts_tracer __read_mostly = .start = bts_trace_start, .stop = bts_trace_stop, .open = trace_bts_prepare, - .close = trace_bts_close + .close = trace_bts_close, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_hw_branches, +#endif /* CONFIG_FTRACE_SELFTEST */ }; __init static int init_bts_trace(void) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index f907a2b2902..3c7b797d0d2 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -16,6 +16,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_BRANCH: case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: + case TRACE_HW_BRANCHES: return 1; } return 0; @@ -691,3 +692,55 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) return ret; } #endif /* CONFIG_BRANCH_TRACER */ + +#ifdef CONFIG_HW_BRANCH_TRACER +int +trace_selftest_startup_hw_branches(struct tracer *trace, + struct trace_array *tr) +{ + unsigned long count; + int ret; + struct trace_iterator iter; + struct tracer tracer; + + if (!trace->open) { + printk(KERN_CONT "missing open function..."); + return -1; + } + + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* + * The hw-branch tracer needs to collect the trace from the various + * cpu trace buffers - before tracing is stopped. + */ + memset(&iter, 0, sizeof(iter)); + memcpy(&tracer, trace, sizeof(tracer)); + + iter.trace = &tracer; + iter.tr = tr; + iter.pos = -1; + mutex_init(&iter.mutex); + + trace->open(&iter); + + mutex_destroy(&iter.mutex); + + tracing_stop(); + + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT "no entries found.."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_HW_BRANCH_TRACER */ -- cgit v1.2.3 From e9a22d1fb94050b7d600019c32e6b672d539054b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 11:54:40 +0100 Subject: x86, bts: cleanups Impact: cleanup, no code changed Cc: Markus Metzger LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_hw_branches.c | 6 +++--- kernel/trace/trace_selftest.c | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 4ca82700c04..8b2109a6c61 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -1,5 +1,5 @@ /* - * h/w branch tracer for x86 based on bts + * h/w branch tracer for x86 based on BTS * * Copyright (C) 2008-2009 Intel Corporation. * Markus Metzger , 2008-2009 @@ -15,8 +15,8 @@ #include -#include "trace.h" #include "trace_output.h" +#include "trace.h" #define BTS_BUFFER_SIZE (1 << 13) @@ -197,10 +197,10 @@ static void bts_trace_print_header(struct seq_file *m) static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) { + unsigned long symflags = TRACE_ITER_SYM_OFFSET; struct trace_entry *entry = iter->ent; struct trace_seq *seq = &iter->seq; struct hw_branch_entry *it; - unsigned long symflags = TRACE_ITER_SYM_OFFSET; trace_assign_type(it, entry); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 3c7b797d0d2..b9109126706 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -189,6 +189,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, #else # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) #endif /* CONFIG_DYNAMIC_FTRACE */ + /* * Simple verification test of ftrace function tracer. * Enable ftrace, sleep 1/10 second, and then read the trace @@ -698,10 +699,10 @@ int trace_selftest_startup_hw_branches(struct tracer *trace, struct trace_array *tr) { - unsigned long count; - int ret; struct trace_iterator iter; struct tracer tracer; + unsigned long count; + int ret; if (!trace->open) { printk(KERN_CONT "missing open function..."); -- cgit v1.2.3 From 425480081e936d8725f0d44b8829d699bf088c6b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 13:38:36 -0400 Subject: tracing: add handler to trace_stat Currently, if a trace_stat user wants a handle to some private data, the trace_stat infrastructure does not supply a way to do that. This patch passes the trace_stat structure to the start function of the trace_stat code. Signed-off-by: Steven Rostedt --- kernel/trace/trace_branch.c | 4 ++-- kernel/trace/trace_stat.c | 2 +- kernel/trace/trace_stat.h | 2 +- kernel/trace/trace_workqueue.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index ad8c22efff4..e6e32912ffb 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -263,7 +263,7 @@ static int branch_stat_show(struct seq_file *m, void *v) return 0; } -static void *annotated_branch_stat_start(void) +static void *annotated_branch_stat_start(struct tracer_stat *trace) { return __start_annotated_branch_profile; } @@ -338,7 +338,7 @@ static int all_branch_stat_headers(struct seq_file *m) return 0; } -static void *all_branch_stat_start(void) +static void *all_branch_stat_start(struct tracer_stat *trace) { return __start_branch_profile; } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index f71b85b22cf..f8f48d84b2c 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -85,7 +85,7 @@ static int stat_seq_init(struct tracer_stat_session *session) if (!ts->stat_cmp) ts->stat_cmp = dummy_cmp; - stat = ts->stat_start(); + stat = ts->stat_start(ts); if (!stat) goto exit; diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index 202274cf7f3..f3546a2cd82 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -12,7 +12,7 @@ struct tracer_stat { /* The name of your stat file */ const char *name; /* Iteration over statistic entries */ - void *(*stat_start)(void); + void *(*stat_start)(struct tracer_stat *trace); void *(*stat_next)(void *prev, int idx); /* Compare two entries for stats sorting */ int (*stat_cmp)(void *p1, void *p2); diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 9ab035b58cf..ee533c2e161 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -152,7 +152,7 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) return ret; } -static void *workqueue_stat_start(void) +static void *workqueue_stat_start(struct tracer_stat *trace) { int cpu; void *ret = NULL; -- cgit v1.2.3 From bac429f037f1a51a74d62bad6d1518c3be065df3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 20 Mar 2009 12:50:56 -0400 Subject: tracing: add function profiler Impact: new profiling feature This patch adds a function profiler. In debugfs/tracing/ two new files are created. function_profile_enabled - to enable or disable profiling trace_stat/functions - the profiled functions. For example: echo 1 > /debugfs/tracing/function_profile_enabled ./hackbench 50 echo 0 > /debugfs/tracing/function_profile_enabled yields: cat /debugfs/tracing/trace_stat/functions Function Hit -------- --- _spin_lock 10106442 _spin_unlock 10097492 kfree 6013704 _spin_unlock_irqrestore 4423941 _spin_lock_irqsave 4406825 __phys_addr 4181686 __slab_free 4038222 dput 4030130 path_put 4023387 unroll_tree_refs 4019532 [...] The most hit functions are listed first. Functions that are not hit are not listed. This feature depends on and uses dynamic function tracing. When the function profiling is disabled, no overhead occurs. But it still takes up around 300KB to hold the data, thus it is not recomended to keep it enabled for systems low on memory. When a '1' is echoed into the function_profile_enabled file, the counters for is function is reset back to zero. Thus you can see what functions are hit most by different programs. Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 19 +++ kernel/trace/ftrace.c | 313 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 330 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8a4d7293104..95e9ad5735d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -105,6 +105,7 @@ config FUNCTION_GRAPH_TRACER This is done by setting the current return address on the current task structure into a stack of calls. + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n @@ -376,6 +377,24 @@ config DYNAMIC_FTRACE were made. If so, it runs stop_machine (stops all CPUS) and modifies the code to jump over the call to ftrace. +config FUNCTION_PROFILER + bool "Kernel function profiler" + depends on DYNAMIC_FTRACE + default n + help + This option enables the kernel function profiler. When the dynamic + function tracing is enabled, a counter is added into the function + records used by the dynamic function tracer. A file is created in + debugfs called function_profile_enabled which defaults to zero. + When a 1 is echoed into this file profiling begins, and when a + zero is entered, profiling stops. A file in the trace_stats + directory called functions, that show the list of functions that + have been hit and their counters. + + This takes up around 320K more memory. + + If in doubt, say N + config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7b8722baf15..11f364c776d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -34,6 +34,7 @@ #include #include "trace.h" +#include "trace_stat.h" #define FTRACE_WARN_ON(cond) \ do { \ @@ -261,7 +262,6 @@ struct ftrace_func_probe { struct rcu_head rcu; }; - enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -309,6 +309,307 @@ static struct dyn_ftrace *ftrace_free_records; } \ } +#ifdef CONFIG_FUNCTION_PROFILER +static struct hlist_head *ftrace_profile_hash; +static int ftrace_profile_bits; +static int ftrace_profile_enabled; +static DEFINE_MUTEX(ftrace_profile_lock); + +static void * +function_stat_next(void *v, int idx) +{ + struct dyn_ftrace *rec = v; + struct ftrace_page *pg; + + pg = (struct ftrace_page *)((unsigned long)rec & PAGE_MASK); + + again: + rec++; + if ((void *)rec >= (void *)&pg->records[pg->index]) { + pg = pg->next; + if (!pg) + return NULL; + rec = &pg->records[0]; + } + + if (rec->flags & FTRACE_FL_FREE || + rec->flags & FTRACE_FL_FAILED || + !(rec->flags & FTRACE_FL_CONVERTED) || + /* ignore non hit functions */ + !rec->counter) + goto again; + + return rec; +} + +static void *function_stat_start(struct tracer_stat *trace) +{ + return function_stat_next(&ftrace_pages_start->records[0], 0); +} + +static int function_stat_cmp(void *p1, void *p2) +{ + struct dyn_ftrace *a = p1; + struct dyn_ftrace *b = p2; + + if (a->counter < b->counter) + return -1; + if (a->counter > b->counter) + return 1; + else + return 0; +} + +static int function_stat_headers(struct seq_file *m) +{ + seq_printf(m, " Function Hit\n" + " -------- ---\n"); + return 0; +} + +static int function_stat_show(struct seq_file *m, void *v) +{ + struct dyn_ftrace *rec = v; + char str[KSYM_SYMBOL_LEN]; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + + seq_printf(m, " %-30.30s %10lu\n", str, rec->counter); + return 0; +} + +static struct tracer_stat function_stats = { + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show +}; + +static void ftrace_profile_init(int nr_funcs) +{ + unsigned long addr; + int order; + int size; + + /* + * We are profiling all functions, lets make it 1/4th of the + * number of functions that are in core kernel. So we have to + * iterate 4 times. + */ + order = (sizeof(struct hlist_head) * nr_funcs) / 4; + order = get_order(order); + size = 1 << (PAGE_SHIFT + order); + + pr_info("Allocating %d KB for profiler hash\n", size >> 10); + + addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); + if (!addr) { + pr_warning("Could not allocate function profiler hash\n"); + return; + } + + ftrace_profile_hash = (void *)addr; + + /* + * struct hlist_head should be a pointer of 4 or 8 bytes. + * And a simple bit manipulation can be done, but if for + * some reason struct hlist_head is not a mulitple of 2, + * then we play it safe, and simply count. This function + * is done once at boot up, so it is not that critical in + * performance. + */ + + size--; + size /= sizeof(struct hlist_head); + + for (; size; size >>= 1) + ftrace_profile_bits++; + + pr_info("Function profiler has %d hash buckets\n", + 1 << ftrace_profile_bits); + + return; +} + +static ssize_t +ftrace_profile_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + r = sprintf(buf, "%u\n", ftrace_profile_enabled); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static void ftrace_profile_reset(void) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + + do_for_each_ftrace_rec(pg, rec) { + rec->counter = 0; + } while_for_each_ftrace_rec(); +} + +static struct dyn_ftrace *ftrace_find_profiled_func(unsigned long ip) +{ + struct dyn_ftrace *rec; + struct hlist_head *hhd; + struct hlist_node *n; + unsigned long flags; + unsigned long key; + + if (!ftrace_profile_hash) + return NULL; + + key = hash_long(ip, ftrace_profile_bits); + hhd = &ftrace_profile_hash[key]; + + if (hlist_empty(hhd)) + return NULL; + + local_irq_save(flags); + hlist_for_each_entry_rcu(rec, n, hhd, node) { + if (rec->ip == ip) + goto out; + } + rec = NULL; + out: + local_irq_restore(flags); + + return rec; +} + +static void +function_profile_call(unsigned long ip, unsigned long parent_ip) +{ + struct dyn_ftrace *rec; + unsigned long flags; + + if (!ftrace_profile_enabled) + return; + + local_irq_save(flags); + rec = ftrace_find_profiled_func(ip); + if (!rec) + goto out; + + rec->counter++; + out: + local_irq_restore(flags); +} + +static struct ftrace_ops ftrace_profile_ops __read_mostly = +{ + .func = function_profile_call, +}; + +static ssize_t +ftrace_profile_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + char buf[64]; + int ret; + + if (!ftrace_profile_hash) { + pr_info("Can not enable hash due to earlier problems\n"); + return -ENODEV; + } + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + + mutex_lock(&ftrace_profile_lock); + if (ftrace_profile_enabled ^ val) { + if (val) { + ftrace_profile_reset(); + register_ftrace_function(&ftrace_profile_ops); + ftrace_profile_enabled = 1; + } else { + ftrace_profile_enabled = 0; + unregister_ftrace_function(&ftrace_profile_ops); + } + } + mutex_unlock(&ftrace_profile_lock); + + filp->f_pos += cnt; + + return cnt; +} + +static const struct file_operations ftrace_profile_fops = { + .open = tracing_open_generic, + .read = ftrace_profile_read, + .write = ftrace_profile_write, +}; + +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ + struct dentry *entry; + int ret; + + ret = register_stat_tracer(&function_stats); + if (ret) { + pr_warning("Warning: could not register " + "function stats\n"); + return; + } + + entry = debugfs_create_file("function_profile_enabled", 0644, + d_tracer, NULL, &ftrace_profile_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'function_profile_enabled' entry\n"); +} + +static void ftrace_add_profile(struct dyn_ftrace *rec) +{ + unsigned long key; + + if (!ftrace_profile_hash) + return; + + key = hash_long(rec->ip, ftrace_profile_bits); + hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]); +} + +static void ftrace_profile_release(struct dyn_ftrace *rec) +{ + mutex_lock(&ftrace_profile_lock); + hlist_del(&rec->node); + mutex_unlock(&ftrace_profile_lock); +} + +#else /* CONFIG_FUNCTION_PROFILER */ +static void ftrace_profile_init(int nr_funcs) +{ +} +static void ftrace_add_profile(struct dyn_ftrace *rec) +{ +} +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ +} +static void ftrace_profile_release(struct dyn_ftrace *rec) +{ +} +#endif /* CONFIG_FUNCTION_PROFILER */ + #ifdef CONFIG_KPROBES static int frozen_record_count; @@ -359,8 +660,10 @@ void ftrace_release(void *start, unsigned long size) mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { if ((rec->ip >= s) && (rec->ip < e) && - !(rec->flags & FTRACE_FL_FREE)) + !(rec->flags & FTRACE_FL_FREE)) { ftrace_free_rec(rec); + ftrace_profile_release(rec); + } } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); } @@ -414,6 +717,8 @@ ftrace_record_ip(unsigned long ip) rec->newlist = ftrace_new_addrs; ftrace_new_addrs = rec; + ftrace_add_profile(rec); + return rec; } @@ -2157,6 +2462,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) "'set_graph_function' entry\n"); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + ftrace_profile_debugfs(d_tracer); + return 0; } @@ -2225,6 +2532,8 @@ void __init ftrace_init(void) if (ret) goto failed; + ftrace_profile_init(count); + last_ftrace_enabled = ftrace_enabled = 1; ret = ftrace_convert_nops(NULL, -- cgit v1.2.3 From 493762fc534c71d11d489f872c4b4a2c61173668 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 Mar 2009 17:12:36 -0400 Subject: tracing: move function profiler data out of function struct Impact: reduce size of memory in function profiler The function profiler originally introduces its counters into the function records itself. There is 20 thousand different functions on a normal system, and that is adding 20 thousand counters for profiling event when not needed. A normal run of the profiler yields only a couple of thousand functions executed, depending on what is being profiled. This means we have around 18 thousand useless counters. This patch rectifies this by moving the data out of the function records used by dynamic ftrace. Data is preallocated to hold the functions when the profiling begins. Checks are made during profiling to see if more recorcds should be allocated, and they are allocated if it is safe to do so. This also removes the dependency from using dynamic ftrace, and also removes the overhead by having it enabled. Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 10 +- kernel/trace/ftrace.c | 440 +++++++++++++++++++++++++++++--------------------- 2 files changed, 263 insertions(+), 187 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 95e9ad5735d..8a4136096d7 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -379,20 +379,16 @@ config DYNAMIC_FTRACE config FUNCTION_PROFILER bool "Kernel function profiler" - depends on DYNAMIC_FTRACE + depends on FUNCTION_TRACER default n help - This option enables the kernel function profiler. When the dynamic - function tracing is enabled, a counter is added into the function - records used by the dynamic function tracer. A file is created in - debugfs called function_profile_enabled which defaults to zero. + This option enables the kernel function profiler. A file is created + in debugfs called function_profile_enabled which defaults to zero. When a 1 is echoed into this file profiling begins, and when a zero is entered, profiling stops. A file in the trace_stats directory called functions, that show the list of functions that have been hit and their counters. - This takes up around 320K more memory. - If in doubt, say N config FTRACE_MCOUNT_RECORD diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 11f364c776d..24dac448cdc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -241,87 +241,48 @@ static void ftrace_update_pid_func(void) #endif } -/* set when tracing only a pid */ -struct pid *ftrace_pid_trace; -static struct pid * const ftrace_swapper_pid = &init_struct_pid; - -#ifdef CONFIG_DYNAMIC_FTRACE - -#ifndef CONFIG_FTRACE_MCOUNT_RECORD -# error Dynamic ftrace depends on MCOUNT_RECORD -#endif - -static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; - -struct ftrace_func_probe { - struct hlist_node node; - struct ftrace_probe_ops *ops; - unsigned long flags; - unsigned long ip; - void *data; - struct rcu_head rcu; +#ifdef CONFIG_FUNCTION_PROFILER +struct ftrace_profile { + struct hlist_node node; + unsigned long ip; + unsigned long counter; }; -enum { - FTRACE_ENABLE_CALLS = (1 << 0), - FTRACE_DISABLE_CALLS = (1 << 1), - FTRACE_UPDATE_TRACE_FUNC = (1 << 2), - FTRACE_ENABLE_MCOUNT = (1 << 3), - FTRACE_DISABLE_MCOUNT = (1 << 4), - FTRACE_START_FUNC_RET = (1 << 5), - FTRACE_STOP_FUNC_RET = (1 << 6), +struct ftrace_profile_page { + struct ftrace_profile_page *next; + unsigned long index; + struct ftrace_profile records[]; }; -static int ftrace_filtered; +#define PROFILE_RECORDS_SIZE \ + (PAGE_SIZE - offsetof(struct ftrace_profile_page, records)) -static struct dyn_ftrace *ftrace_new_addrs; +#define PROFILES_PER_PAGE \ + (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) -static DEFINE_MUTEX(ftrace_regex_lock); - -struct ftrace_page { - struct ftrace_page *next; - int index; - struct dyn_ftrace records[]; -}; +/* TODO: make these percpu, to prevent cache line bouncing */ +static struct ftrace_profile_page *profile_pages_start; +static struct ftrace_profile_page *profile_pages; -#define ENTRIES_PER_PAGE \ - ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) - -/* estimate from running different kernels */ -#define NR_TO_INIT 10000 - -static struct ftrace_page *ftrace_pages_start; -static struct ftrace_page *ftrace_pages; - -static struct dyn_ftrace *ftrace_free_records; - -/* - * This is a double for. Do not use 'break' to break out of the loop, - * you must use a goto. - */ -#define do_for_each_ftrace_rec(pg, rec) \ - for (pg = ftrace_pages_start; pg; pg = pg->next) { \ - int _____i; \ - for (_____i = 0; _____i < pg->index; _____i++) { \ - rec = &pg->records[_____i]; - -#define while_for_each_ftrace_rec() \ - } \ - } - -#ifdef CONFIG_FUNCTION_PROFILER static struct hlist_head *ftrace_profile_hash; static int ftrace_profile_bits; static int ftrace_profile_enabled; static DEFINE_MUTEX(ftrace_profile_lock); +static DEFINE_PER_CPU(atomic_t, ftrace_profile_disable); + +#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ + +static raw_spinlock_t ftrace_profile_rec_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + static void * function_stat_next(void *v, int idx) { - struct dyn_ftrace *rec = v; - struct ftrace_page *pg; + struct ftrace_profile *rec = v; + struct ftrace_profile_page *pg; - pg = (struct ftrace_page *)((unsigned long)rec & PAGE_MASK); + pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); again: rec++; @@ -330,27 +291,22 @@ function_stat_next(void *v, int idx) if (!pg) return NULL; rec = &pg->records[0]; + if (!rec->counter) + goto again; } - if (rec->flags & FTRACE_FL_FREE || - rec->flags & FTRACE_FL_FAILED || - !(rec->flags & FTRACE_FL_CONVERTED) || - /* ignore non hit functions */ - !rec->counter) - goto again; - return rec; } static void *function_stat_start(struct tracer_stat *trace) { - return function_stat_next(&ftrace_pages_start->records[0], 0); + return function_stat_next(&profile_pages_start->records[0], 0); } static int function_stat_cmp(void *p1, void *p2) { - struct dyn_ftrace *a = p1; - struct dyn_ftrace *b = p2; + struct ftrace_profile *a = p1; + struct ftrace_profile *b = p2; if (a->counter < b->counter) return -1; @@ -369,7 +325,7 @@ static int function_stat_headers(struct seq_file *m) static int function_stat_show(struct seq_file *m, void *v) { - struct dyn_ftrace *rec = v; + struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); @@ -387,115 +343,191 @@ static struct tracer_stat function_stats = { .stat_show = function_stat_show }; -static void ftrace_profile_init(int nr_funcs) +static void ftrace_profile_reset(void) { - unsigned long addr; - int order; - int size; + struct ftrace_profile_page *pg; - /* - * We are profiling all functions, lets make it 1/4th of the - * number of functions that are in core kernel. So we have to - * iterate 4 times. - */ - order = (sizeof(struct hlist_head) * nr_funcs) / 4; - order = get_order(order); - size = 1 << (PAGE_SHIFT + order); - - pr_info("Allocating %d KB for profiler hash\n", size >> 10); + pg = profile_pages = profile_pages_start; - addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - if (!addr) { - pr_warning("Could not allocate function profiler hash\n"); - return; + while (pg) { + memset(pg->records, 0, PROFILE_RECORDS_SIZE); + pg->index = 0; + pg = pg->next; } - ftrace_profile_hash = (void *)addr; + memset(ftrace_profile_hash, 0, + FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); +} - /* - * struct hlist_head should be a pointer of 4 or 8 bytes. - * And a simple bit manipulation can be done, but if for - * some reason struct hlist_head is not a mulitple of 2, - * then we play it safe, and simply count. This function - * is done once at boot up, so it is not that critical in - * performance. - */ +int ftrace_profile_pages_init(void) +{ + struct ftrace_profile_page *pg; + int i; - size--; - size /= sizeof(struct hlist_head); + /* If we already allocated, do nothing */ + if (profile_pages) + return 0; - for (; size; size >>= 1) - ftrace_profile_bits++; + profile_pages = (void *)get_zeroed_page(GFP_KERNEL); + if (!profile_pages) + return -ENOMEM; - pr_info("Function profiler has %d hash buckets\n", - 1 << ftrace_profile_bits); + pg = profile_pages_start = profile_pages; - return; + /* allocate 10 more pages to start */ + for (i = 0; i < 10; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + /* + * We only care about allocating profile_pages, if + * we failed to allocate here, hopefully we will allocate + * later. + */ + if (!pg->next) + break; + pg = pg->next; + } + + return 0; } -static ssize_t -ftrace_profile_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) +static int ftrace_profile_init(void) { - char buf[64]; - int r; + int size; - r = sprintf(buf, "%u\n", ftrace_profile_enabled); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} + if (ftrace_profile_hash) { + /* If the profile is already created, simply reset it */ + ftrace_profile_reset(); + return 0; + } -static void ftrace_profile_reset(void) -{ - struct dyn_ftrace *rec; - struct ftrace_page *pg; + /* + * We are profiling all functions, but usually only a few thousand + * functions are hit. We'll make a hash of 1024 items. + */ + size = FTRACE_PROFILE_HASH_SIZE; - do_for_each_ftrace_rec(pg, rec) { - rec->counter = 0; - } while_for_each_ftrace_rec(); + ftrace_profile_hash = + kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); + + if (!ftrace_profile_hash) + return -ENOMEM; + + size--; + + for (; size; size >>= 1) + ftrace_profile_bits++; + + /* Preallocate a few pages */ + if (ftrace_profile_pages_init() < 0) { + kfree(ftrace_profile_hash); + ftrace_profile_hash = NULL; + return -ENOMEM; + } + + return 0; } -static struct dyn_ftrace *ftrace_find_profiled_func(unsigned long ip) +/* interrupts must be disabled */ +static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip) { - struct dyn_ftrace *rec; + struct ftrace_profile *rec; struct hlist_head *hhd; struct hlist_node *n; - unsigned long flags; unsigned long key; - if (!ftrace_profile_hash) - return NULL; - key = hash_long(ip, ftrace_profile_bits); hhd = &ftrace_profile_hash[key]; if (hlist_empty(hhd)) return NULL; - local_irq_save(flags); hlist_for_each_entry_rcu(rec, n, hhd, node) { if (rec->ip == ip) - goto out; + return rec; + } + + return NULL; +} + +static void ftrace_add_profile(struct ftrace_profile *rec) +{ + unsigned long key; + + key = hash_long(rec->ip, ftrace_profile_bits); + hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]); +} + +/* Interrupts must be disabled calling this */ +static struct ftrace_profile * +ftrace_profile_alloc(unsigned long ip, bool alloc_safe) +{ + struct ftrace_profile *rec = NULL; + + /* prevent recursion */ + if (atomic_inc_return(&__get_cpu_var(ftrace_profile_disable)) != 1) + goto out; + + __raw_spin_lock(&ftrace_profile_rec_lock); + + /* Try to always keep another page available */ + if (!profile_pages->next && alloc_safe) + profile_pages->next = (void *)get_zeroed_page(GFP_ATOMIC); + + /* + * Try to find the function again since another + * task on another CPU could have added it + */ + rec = ftrace_find_profiled_func(ip); + if (rec) + goto out_unlock; + + if (profile_pages->index == PROFILES_PER_PAGE) { + if (!profile_pages->next) + goto out_unlock; + profile_pages = profile_pages->next; } - rec = NULL; + + rec = &profile_pages->records[profile_pages->index++]; + rec->ip = ip; + ftrace_add_profile(rec); + + out_unlock: + __raw_spin_unlock(&ftrace_profile_rec_lock); out: - local_irq_restore(flags); + atomic_dec(&__get_cpu_var(ftrace_profile_disable)); return rec; } +/* + * If we are not in an interrupt, or softirq and + * and interrupts are disabled and preemption is not enabled + * (not in a spinlock) then it should be safe to allocate memory. + */ +static bool ftrace_safe_to_allocate(void) +{ + return !in_interrupt() && irqs_disabled() && !preempt_count(); +} + static void function_profile_call(unsigned long ip, unsigned long parent_ip) { - struct dyn_ftrace *rec; + struct ftrace_profile *rec; unsigned long flags; + bool alloc_safe; if (!ftrace_profile_enabled) return; + alloc_safe = ftrace_safe_to_allocate(); + local_irq_save(flags); rec = ftrace_find_profiled_func(ip); - if (!rec) - goto out; + if (!rec) { + rec = ftrace_profile_alloc(ip, alloc_safe); + if (!rec) + goto out; + } rec->counter++; out: @@ -515,11 +547,6 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, char buf[64]; int ret; - if (!ftrace_profile_hash) { - pr_info("Can not enable hash due to earlier problems\n"); - return -ENODEV; - } - if (cnt >= sizeof(buf)) return -EINVAL; @@ -537,7 +564,12 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, mutex_lock(&ftrace_profile_lock); if (ftrace_profile_enabled ^ val) { if (val) { - ftrace_profile_reset(); + ret = ftrace_profile_init(); + if (ret < 0) { + cnt = ret; + goto out; + } + register_ftrace_function(&ftrace_profile_ops); ftrace_profile_enabled = 1; } else { @@ -545,6 +577,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, unregister_ftrace_function(&ftrace_profile_ops); } } + out: mutex_unlock(&ftrace_profile_lock); filp->f_pos += cnt; @@ -552,6 +585,17 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, return cnt; } +static ssize_t +ftrace_profile_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + r = sprintf(buf, "%u\n", ftrace_profile_enabled); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + static const struct file_operations ftrace_profile_fops = { .open = tracing_open_generic, .read = ftrace_profile_read, @@ -577,39 +621,80 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer) "'function_profile_enabled' entry\n"); } -static void ftrace_add_profile(struct dyn_ftrace *rec) -{ - unsigned long key; - - if (!ftrace_profile_hash) - return; - - key = hash_long(rec->ip, ftrace_profile_bits); - hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]); -} - -static void ftrace_profile_release(struct dyn_ftrace *rec) -{ - mutex_lock(&ftrace_profile_lock); - hlist_del(&rec->node); - mutex_unlock(&ftrace_profile_lock); -} - #else /* CONFIG_FUNCTION_PROFILER */ -static void ftrace_profile_init(int nr_funcs) -{ -} -static void ftrace_add_profile(struct dyn_ftrace *rec) -{ -} static void ftrace_profile_debugfs(struct dentry *d_tracer) { } -static void ftrace_profile_release(struct dyn_ftrace *rec) -{ -} #endif /* CONFIG_FUNCTION_PROFILER */ +/* set when tracing only a pid */ +struct pid *ftrace_pid_trace; +static struct pid * const ftrace_swapper_pid = &init_struct_pid; + +#ifdef CONFIG_DYNAMIC_FTRACE + +#ifndef CONFIG_FTRACE_MCOUNT_RECORD +# error Dynamic ftrace depends on MCOUNT_RECORD +#endif + +static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; + +struct ftrace_func_probe { + struct hlist_node node; + struct ftrace_probe_ops *ops; + unsigned long flags; + unsigned long ip; + void *data; + struct rcu_head rcu; +}; + +enum { + FTRACE_ENABLE_CALLS = (1 << 0), + FTRACE_DISABLE_CALLS = (1 << 1), + FTRACE_UPDATE_TRACE_FUNC = (1 << 2), + FTRACE_ENABLE_MCOUNT = (1 << 3), + FTRACE_DISABLE_MCOUNT = (1 << 4), + FTRACE_START_FUNC_RET = (1 << 5), + FTRACE_STOP_FUNC_RET = (1 << 6), +}; + +static int ftrace_filtered; + +static struct dyn_ftrace *ftrace_new_addrs; + +static DEFINE_MUTEX(ftrace_regex_lock); + +struct ftrace_page { + struct ftrace_page *next; + int index; + struct dyn_ftrace records[]; +}; + +#define ENTRIES_PER_PAGE \ + ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) + +/* estimate from running different kernels */ +#define NR_TO_INIT 10000 + +static struct ftrace_page *ftrace_pages_start; +static struct ftrace_page *ftrace_pages; + +static struct dyn_ftrace *ftrace_free_records; + +/* + * This is a double for. Do not use 'break' to break out of the loop, + * you must use a goto. + */ +#define do_for_each_ftrace_rec(pg, rec) \ + for (pg = ftrace_pages_start; pg; pg = pg->next) { \ + int _____i; \ + for (_____i = 0; _____i < pg->index; _____i++) { \ + rec = &pg->records[_____i]; + +#define while_for_each_ftrace_rec() \ + } \ + } + #ifdef CONFIG_KPROBES static int frozen_record_count; @@ -660,10 +745,8 @@ void ftrace_release(void *start, unsigned long size) mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { if ((rec->ip >= s) && (rec->ip < e) && - !(rec->flags & FTRACE_FL_FREE)) { + !(rec->flags & FTRACE_FL_FREE)) ftrace_free_rec(rec); - ftrace_profile_release(rec); - } } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); } @@ -717,8 +800,6 @@ ftrace_record_ip(unsigned long ip) rec->newlist = ftrace_new_addrs; ftrace_new_addrs = rec; - ftrace_add_profile(rec); - return rec; } @@ -2462,8 +2543,6 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) "'set_graph_function' entry\n"); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - ftrace_profile_debugfs(d_tracer); - return 0; } @@ -2532,8 +2611,6 @@ void __init ftrace_init(void) if (ret) goto failed; - ftrace_profile_init(count); - last_ftrace_enabled = ftrace_enabled = 1; ret = ftrace_convert_nops(NULL, @@ -2734,6 +2811,9 @@ static __init int ftrace_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs " "'set_ftrace_pid' entry\n"); + + ftrace_profile_debugfs(d_tracer); + return 0; } fs_initcall(ftrace_init_debugfs); -- cgit v1.2.3 From 0706f1c48ca8a7ab478090b4e38f2e578ae2bfe0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 Mar 2009 23:12:58 -0400 Subject: tracing: adding function timings to function profiler If the function graph trace is enabled, the function profiler will use it to take the timing of the functions. cat /debug/tracing/trace_stat/functions Function Hit Time -------- --- ---- mwait_idle 127 183028.4 us schedule 26 151997.7 us __schedule 31 151975.1 us sys_wait4 2 74080.53 us do_wait 2 74077.80 us sys_newlstat 138 39929.16 us do_path_lookup 179 39845.79 us vfs_lstat_fd 138 39761.97 us user_path_at 153 39469.58 us path_walk 179 39435.76 us __link_path_walk 189 39143.73 us [...] Note the times are skewed due to the function graph tracer not taking into account schedules. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 93 ++++++++++++++++++++++++++++++++++-- kernel/trace/trace.c | 11 ----- kernel/trace/trace.h | 3 +- kernel/trace/trace_functions_graph.c | 17 +++++-- kernel/trace/trace_output.c | 10 ++++ kernel/trace/trace_output.h | 2 + 6 files changed, 117 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 24dac448cdc..a9ccd71fc92 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -33,7 +33,7 @@ #include -#include "trace.h" +#include "trace_output.h" #include "trace_stat.h" #define FTRACE_WARN_ON(cond) \ @@ -246,6 +246,9 @@ struct ftrace_profile { struct hlist_node node; unsigned long ip; unsigned long counter; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + unsigned long long time; +#endif }; struct ftrace_profile_page { @@ -303,6 +306,22 @@ static void *function_stat_start(struct tracer_stat *trace) return function_stat_next(&profile_pages_start->records[0], 0); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* function graph compares on total time */ +static int function_stat_cmp(void *p1, void *p2) +{ + struct ftrace_profile *a = p1; + struct ftrace_profile *b = p2; + + if (a->time < b->time) + return -1; + if (a->time > b->time) + return 1; + else + return 0; +} +#else +/* not function graph compares against hits */ static int function_stat_cmp(void *p1, void *p2) { struct ftrace_profile *a = p1; @@ -315,11 +334,17 @@ static int function_stat_cmp(void *p1, void *p2) else return 0; } +#endif static int function_stat_headers(struct seq_file *m) { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + seq_printf(m, " Function Hit Time\n" + " -------- --- ----\n"); +#else seq_printf(m, " Function Hit\n" " -------- ---\n"); +#endif return 0; } @@ -327,10 +352,25 @@ static int function_stat_show(struct seq_file *m, void *v) { struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + static struct trace_seq s; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + trace_seq_init(&s); + trace_print_graph_duration(rec->time, &s); +#endif kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + seq_printf(m, " %-30.30s %10lu", str, rec->counter); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + seq_printf(m, " "); + trace_print_seq(m, &s); + mutex_unlock(&mutex); +#endif + seq_putc(m, '\n'); - seq_printf(m, " %-30.30s %10lu\n", str, rec->counter); return 0; } @@ -534,11 +574,52 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) local_irq_restore(flags); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int profile_graph_entry(struct ftrace_graph_ent *trace) +{ + function_profile_call(trace->func, 0); + return 1; +} + +static void profile_graph_return(struct ftrace_graph_ret *trace) +{ + unsigned long flags; + struct ftrace_profile *rec; + + local_irq_save(flags); + rec = ftrace_find_profiled_func(trace->func); + if (rec) + rec->time += trace->rettime - trace->calltime; + local_irq_restore(flags); +} + +static int register_ftrace_profiler(void) +{ + return register_ftrace_graph(&profile_graph_return, + &profile_graph_entry); +} + +static void unregister_ftrace_profiler(void) +{ + unregister_ftrace_graph(); +} +#else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, }; +static int register_ftrace_profiler(void) +{ + return register_ftrace_function(&ftrace_profile_ops); +} + +static void unregister_ftrace_profiler(void) +{ + unregister_ftrace_function(&ftrace_profile_ops); +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + static ssize_t ftrace_profile_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -570,11 +651,15 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, goto out; } - register_ftrace_function(&ftrace_profile_ops); + ret = register_ftrace_profiler(); + if (ret < 0) { + cnt = ret; + goto out; + } ftrace_profile_enabled = 1; } else { ftrace_profile_enabled = 0; - unregister_ftrace_function(&ftrace_profile_ops); + unregister_ftrace_profiler(); } } out: diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 67c6a21dd42..821bf49771d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -402,17 +402,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) return cnt; } -static void -trace_print_seq(struct seq_file *m, struct trace_seq *s) -{ - int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; - - s->buffer[len] = 0; - seq_puts(m, s->buffer); - - trace_seq_init(s); -} - /** * update_max_tr - snapshot all trace buffers from global_trace to max_tr * @tr: tracer diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d7410bbb9a8..c66ca3b6605 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -605,6 +605,8 @@ extern unsigned long trace_flags; /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern enum print_line_t print_graph_function(struct trace_iterator *iter); +extern enum print_line_t +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ @@ -636,7 +638,6 @@ static inline int ftrace_graph_addr(unsigned long addr) return 1; } #endif /* CONFIG_DYNAMIC_FTRACE */ - #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t print_graph_function(struct trace_iterator *iter) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d28687e7b3a..85bba0f018b 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -426,8 +426,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, return TRACE_TYPE_HANDLED; } -static enum print_line_t -print_graph_duration(unsigned long long duration, struct trace_seq *s) +enum print_line_t +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) { unsigned long nsecs_rem = do_div(duration, 1000); /* log10(ULONG_MAX) + '\0' */ @@ -464,12 +464,23 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s) if (!ret) return TRACE_TYPE_PARTIAL_LINE; } + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_duration(unsigned long long duration, struct trace_seq *s) +{ + int ret; + + ret = trace_print_graph_duration(duration, s); + if (ret != TRACE_TYPE_HANDLED) + return ret; ret = trace_seq_printf(s, "| "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; + return TRACE_TYPE_HANDLED; } /* Case of a leaf function on its call entry */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 19261fdd245..a3b6e3fd704 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -19,6 +19,16 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; +void trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ + int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; + + s->buffer[len] = 0; + seq_puts(m, s->buffer); + + trace_seq_init(s); +} + enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 35c422fb51a..1eac2973374 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -20,6 +20,8 @@ trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter); +extern void trace_print_seq(struct seq_file *m, struct trace_seq *s); + extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); extern int -- cgit v1.2.3 From cafb168a1c92e4c9e1731fe3d666c39611762c49 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 20:50:39 -0400 Subject: tracing: make the function profiler per cpu Impact: speed enhancement By making the function profiler record in per cpu data we not only get better readings, avoid races, we also do not have to take any locks. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 199 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 130 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a9ccd71fc92..ed1fc5021d4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -257,28 +257,28 @@ struct ftrace_profile_page { struct ftrace_profile records[]; }; +struct ftrace_profile_stat { + atomic_t disabled; + struct hlist_head *hash; + struct ftrace_profile_page *pages; + struct ftrace_profile_page *start; + struct tracer_stat stat; +}; + #define PROFILE_RECORDS_SIZE \ (PAGE_SIZE - offsetof(struct ftrace_profile_page, records)) #define PROFILES_PER_PAGE \ (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) -/* TODO: make these percpu, to prevent cache line bouncing */ -static struct ftrace_profile_page *profile_pages_start; -static struct ftrace_profile_page *profile_pages; - -static struct hlist_head *ftrace_profile_hash; static int ftrace_profile_bits; static int ftrace_profile_enabled; static DEFINE_MUTEX(ftrace_profile_lock); -static DEFINE_PER_CPU(atomic_t, ftrace_profile_disable); +static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); #define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ -static raw_spinlock_t ftrace_profile_rec_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - static void * function_stat_next(void *v, int idx) { @@ -303,7 +303,13 @@ function_stat_next(void *v, int idx) static void *function_stat_start(struct tracer_stat *trace) { - return function_stat_next(&profile_pages_start->records[0], 0); + struct ftrace_profile_stat *stat = + container_of(trace, struct ftrace_profile_stat, stat); + + if (!stat || !stat->start) + return NULL; + + return function_stat_next(&stat->start->records[0], 0); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -374,20 +380,11 @@ static int function_stat_show(struct seq_file *m, void *v) return 0; } -static struct tracer_stat function_stats = { - .name = "functions", - .stat_start = function_stat_start, - .stat_next = function_stat_next, - .stat_cmp = function_stat_cmp, - .stat_headers = function_stat_headers, - .stat_show = function_stat_show -}; - -static void ftrace_profile_reset(void) +static void ftrace_profile_reset(struct ftrace_profile_stat *stat) { struct ftrace_profile_page *pg; - pg = profile_pages = profile_pages_start; + pg = stat->pages = stat->start; while (pg) { memset(pg->records, 0, PROFILE_RECORDS_SIZE); @@ -395,24 +392,24 @@ static void ftrace_profile_reset(void) pg = pg->next; } - memset(ftrace_profile_hash, 0, + memset(stat->hash, 0, FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); } -int ftrace_profile_pages_init(void) +int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) { struct ftrace_profile_page *pg; int i; /* If we already allocated, do nothing */ - if (profile_pages) + if (stat->pages) return 0; - profile_pages = (void *)get_zeroed_page(GFP_KERNEL); - if (!profile_pages) + stat->pages = (void *)get_zeroed_page(GFP_KERNEL); + if (!stat->pages) return -ENOMEM; - pg = profile_pages_start = profile_pages; + pg = stat->start = stat->pages; /* allocate 10 more pages to start */ for (i = 0; i < 10; i++) { @@ -430,13 +427,16 @@ int ftrace_profile_pages_init(void) return 0; } -static int ftrace_profile_init(void) +static int ftrace_profile_init_cpu(int cpu) { + struct ftrace_profile_stat *stat; int size; - if (ftrace_profile_hash) { + stat = &per_cpu(ftrace_profile_stats, cpu); + + if (stat->hash) { /* If the profile is already created, simply reset it */ - ftrace_profile_reset(); + ftrace_profile_reset(stat); return 0; } @@ -446,29 +446,45 @@ static int ftrace_profile_init(void) */ size = FTRACE_PROFILE_HASH_SIZE; - ftrace_profile_hash = - kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); + stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); - if (!ftrace_profile_hash) + if (!stat->hash) return -ENOMEM; - size--; + if (!ftrace_profile_bits) { + size--; - for (; size; size >>= 1) - ftrace_profile_bits++; + for (; size; size >>= 1) + ftrace_profile_bits++; + } /* Preallocate a few pages */ - if (ftrace_profile_pages_init() < 0) { - kfree(ftrace_profile_hash); - ftrace_profile_hash = NULL; + if (ftrace_profile_pages_init(stat) < 0) { + kfree(stat->hash); + stat->hash = NULL; return -ENOMEM; } return 0; } +static int ftrace_profile_init(void) +{ + int cpu; + int ret = 0; + + for_each_online_cpu(cpu) { + ret = ftrace_profile_init_cpu(cpu); + if (ret) + break; + } + + return ret; +} + /* interrupts must be disabled */ -static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip) +static struct ftrace_profile * +ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) { struct ftrace_profile *rec; struct hlist_head *hhd; @@ -476,7 +492,7 @@ static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip) unsigned long key; key = hash_long(ip, ftrace_profile_bits); - hhd = &ftrace_profile_hash[key]; + hhd = &stat->hash[key]; if (hlist_empty(hhd)) return NULL; @@ -489,52 +505,50 @@ static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip) return NULL; } -static void ftrace_add_profile(struct ftrace_profile *rec) +static void ftrace_add_profile(struct ftrace_profile_stat *stat, + struct ftrace_profile *rec) { unsigned long key; key = hash_long(rec->ip, ftrace_profile_bits); - hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]); + hlist_add_head_rcu(&rec->node, &stat->hash[key]); } /* Interrupts must be disabled calling this */ static struct ftrace_profile * -ftrace_profile_alloc(unsigned long ip, bool alloc_safe) +ftrace_profile_alloc(struct ftrace_profile_stat *stat, + unsigned long ip, bool alloc_safe) { struct ftrace_profile *rec = NULL; /* prevent recursion */ - if (atomic_inc_return(&__get_cpu_var(ftrace_profile_disable)) != 1) + if (atomic_inc_return(&stat->disabled) != 1) goto out; - __raw_spin_lock(&ftrace_profile_rec_lock); - /* Try to always keep another page available */ - if (!profile_pages->next && alloc_safe) - profile_pages->next = (void *)get_zeroed_page(GFP_ATOMIC); + if (!stat->pages->next && alloc_safe) + stat->pages->next = (void *)get_zeroed_page(GFP_ATOMIC); /* * Try to find the function again since another * task on another CPU could have added it */ - rec = ftrace_find_profiled_func(ip); + rec = ftrace_find_profiled_func(stat, ip); if (rec) - goto out_unlock; + goto out; - if (profile_pages->index == PROFILES_PER_PAGE) { - if (!profile_pages->next) - goto out_unlock; - profile_pages = profile_pages->next; + if (stat->pages->index == PROFILES_PER_PAGE) { + if (!stat->pages->next) + goto out; + stat->pages = stat->pages->next; } - rec = &profile_pages->records[profile_pages->index++]; + rec = &stat->pages->records[stat->pages->index++]; rec->ip = ip; - ftrace_add_profile(rec); + ftrace_add_profile(stat, rec); - out_unlock: - __raw_spin_unlock(&ftrace_profile_rec_lock); out: - atomic_dec(&__get_cpu_var(ftrace_profile_disable)); + atomic_dec(&stat->disabled); return rec; } @@ -552,6 +566,7 @@ static bool ftrace_safe_to_allocate(void) static void function_profile_call(unsigned long ip, unsigned long parent_ip) { + struct ftrace_profile_stat *stat; struct ftrace_profile *rec; unsigned long flags; bool alloc_safe; @@ -562,9 +577,14 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) alloc_safe = ftrace_safe_to_allocate(); local_irq_save(flags); - rec = ftrace_find_profiled_func(ip); + + stat = &__get_cpu_var(ftrace_profile_stats); + if (!stat->hash) + goto out; + + rec = ftrace_find_profiled_func(stat, ip); if (!rec) { - rec = ftrace_profile_alloc(ip, alloc_safe); + rec = ftrace_profile_alloc(stat, ip, alloc_safe); if (!rec) goto out; } @@ -583,13 +603,19 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace) static void profile_graph_return(struct ftrace_graph_ret *trace) { - unsigned long flags; + struct ftrace_profile_stat *stat; struct ftrace_profile *rec; + unsigned long flags; local_irq_save(flags); - rec = ftrace_find_profiled_func(trace->func); + stat = &__get_cpu_var(ftrace_profile_stats); + if (!stat->hash) + goto out; + + rec = ftrace_find_profiled_func(stat, trace->func); if (rec) rec->time += trace->rettime - trace->calltime; + out: local_irq_restore(flags); } @@ -687,16 +713,51 @@ static const struct file_operations ftrace_profile_fops = { .write = ftrace_profile_write, }; +/* used to initialize the real stat files */ +static struct tracer_stat function_stats __initdata = { + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show +}; + static void ftrace_profile_debugfs(struct dentry *d_tracer) { + struct ftrace_profile_stat *stat; struct dentry *entry; + char *name; int ret; + int cpu; - ret = register_stat_tracer(&function_stats); - if (ret) { - pr_warning("Warning: could not register " - "function stats\n"); - return; + for_each_possible_cpu(cpu) { + stat = &per_cpu(ftrace_profile_stats, cpu); + + /* allocate enough for function name + cpu number */ + name = kmalloc(32, GFP_KERNEL); + if (!name) { + /* + * The files created are permanent, if something happens + * we still do not free memory. + */ + kfree(stat); + WARN(1, + "Could not allocate stat file for cpu %d\n", + cpu); + return; + } + stat->stat = function_stats; + snprintf(name, 32, "function%d", cpu); + stat->stat.name = name; + ret = register_stat_tracer(&stat->stat); + if (ret) { + WARN(1, + "Could not register function stat for cpu %d\n", + cpu); + kfree(name); + return; + } } entry = debugfs_create_file("function_profile_enabled", 0644, -- cgit v1.2.3 From a2a16d6a3156ef7309ca7328a20c35df9418e670 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 23:17:58 -0400 Subject: function-graph: add option to calculate graph time or not graph time is the time that a function is executing another function. Thus if function A calls B, if graph-time is set, then the time for A includes B. This is the default behavior. But if graph-time is off, then the time spent executing B is subtracted from A. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 21 ++++++++++++++++++++- kernel/trace/trace.c | 4 +++- kernel/trace/trace.h | 1 + kernel/trace/trace_functions_graph.c | 8 ++++---- 4 files changed, 28 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ed1fc5021d4..71e5faef12a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -604,6 +604,7 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace) static void profile_graph_return(struct ftrace_graph_ret *trace) { struct ftrace_profile_stat *stat; + unsigned long long calltime; struct ftrace_profile *rec; unsigned long flags; @@ -612,9 +613,27 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) if (!stat->hash) goto out; + calltime = trace->rettime - trace->calltime; + + if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { + int index; + + index = trace->depth; + + /* Append this call time to the parent time to subtract */ + if (index) + current->ret_stack[index - 1].subtime += calltime; + + if (current->ret_stack[index].subtime < calltime) + calltime -= current->ret_stack[index].subtime; + else + calltime = 0; + } + rec = ftrace_find_profiled_func(stat, trace->func); if (rec) - rec->time += trace->rettime - trace->calltime; + rec->time += calltime; + out: local_irq_restore(flags); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 821bf49771d..5d1a16cae37 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -255,7 +255,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME; + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | + TRACE_ITER_GRAPH_TIME; /** * trace_wake_up - wake up tasks waiting for trace input @@ -317,6 +318,7 @@ static const char *trace_options[] = { "latency-format", "global-clock", "sleep-time", + "graph-time", NULL }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c66ca3b6605..e3429a8ab05 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -685,6 +685,7 @@ enum trace_iterator_flags { TRACE_ITER_LATENCY_FMT = 0x40000, TRACE_ITER_GLOBAL_CLK = 0x80000, TRACE_ITER_SLEEP_TIME = 0x100000, + TRACE_ITER_GRAPH_TIME = 0x200000, }; /* diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 85bba0f018b..10f6ad7d85f 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -78,13 +78,14 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; + current->ret_stack[index].subtime = 0; *depth = index; return 0; } /* Retrieve a function return address to the trace stack on thread info.*/ -void +static void ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) { int index; @@ -104,9 +105,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) trace->calltime = current->ret_stack[index].calltime; trace->overrun = atomic_read(¤t->trace_overrun); trace->depth = index; - barrier(); - current->curr_ret_stack--; - } /* @@ -121,6 +119,8 @@ unsigned long ftrace_return_to_handler(void) ftrace_pop_return_trace(&trace, &ret); trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); + barrier(); + current->curr_ret_stack--; if (unlikely(!ret)) { ftrace_graph_stop(); -- cgit v1.2.3 From fb9fb015e92123fa3a8e0c2e2fff491d4a56b470 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Mar 2009 13:26:41 -0400 Subject: tracing: clean up tracing profiler Ingo Molnar suggested clean ups for the profiling code. This patch makes those updates. Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 71e5faef12a..a141d8499ab 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -69,7 +69,7 @@ static DEFINE_MUTEX(ftrace_lock); static struct ftrace_ops ftrace_list_end __read_mostly = { - .func = ftrace_stub, + .func = ftrace_stub, }; static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; @@ -271,8 +271,10 @@ struct ftrace_profile_stat { #define PROFILES_PER_PAGE \ (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) -static int ftrace_profile_bits; -static int ftrace_profile_enabled; +static int ftrace_profile_bits __read_mostly; +static int ftrace_profile_enabled __read_mostly; + +/* ftrace_profile_lock - synchronize the enable and disable of the profiler */ static DEFINE_MUTEX(ftrace_profile_lock); static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); @@ -651,7 +653,7 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { - .func = function_profile_call, + .func = function_profile_call, }; static int register_ftrace_profiler(void) @@ -670,7 +672,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long val; - char buf[64]; + char buf[64]; /* big enough to hold a number */ int ret; if (cnt >= sizeof(buf)) @@ -719,7 +721,7 @@ static ssize_t ftrace_profile_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[64]; + char buf[64]; /* big enough to hold a number */ int r; r = sprintf(buf, "%u\n", ftrace_profile_enabled); @@ -734,12 +736,12 @@ static const struct file_operations ftrace_profile_fops = { /* used to initialize the real stat files */ static struct tracer_stat function_stats __initdata = { - .name = "functions", - .stat_start = function_stat_start, - .stat_next = function_stat_next, - .stat_cmp = function_stat_cmp, - .stat_headers = function_stat_headers, - .stat_show = function_stat_show + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show }; static void ftrace_profile_debugfs(struct dentry *d_tracer) @@ -1954,7 +1956,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_probe_ops __read_mostly = { - .func = function_trace_probe_call, + .func = function_trace_probe_call, }; static int ftrace_probe_registered; -- cgit v1.2.3 From 318e0a73c9e41b9a17241829bcd0605a39b87cb9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Mar 2009 20:06:34 -0400 Subject: tracing: remove on the fly allocator from function profiler Impact: safer code The on the fly allocator for the function profiler was to save memory. But at the expense of stability. Although it survived several tests, allocating from the function tracer is just too risky, just to save space. This patch removes the allocator and simply allocates enough entries at start up. Each function gets a profiling structure of 40 bytes. With an average of 20K functions, and this is for each CPU, we have 800K per online CPU. This is not too bad, at least for non-embedded. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 76 +++++++++++++++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a141d8499ab..4d90c916b2b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -401,6 +401,8 @@ static void ftrace_profile_reset(struct ftrace_profile_stat *stat) int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) { struct ftrace_profile_page *pg; + int functions; + int pages; int i; /* If we already allocated, do nothing */ @@ -411,22 +413,46 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) if (!stat->pages) return -ENOMEM; +#ifdef CONFIG_DYNAMIC_FTRACE + functions = ftrace_update_tot_cnt; +#else + /* + * We do not know the number of functions that exist because + * dynamic tracing is what counts them. With past experience + * we have around 20K functions. That should be more than enough. + * It is highly unlikely we will execute every function in + * the kernel. + */ + functions = 20000; +#endif + pg = stat->start = stat->pages; - /* allocate 10 more pages to start */ - for (i = 0; i < 10; i++) { + pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); + + for (i = 0; i < pages; i++) { pg->next = (void *)get_zeroed_page(GFP_KERNEL); - /* - * We only care about allocating profile_pages, if - * we failed to allocate here, hopefully we will allocate - * later. - */ if (!pg->next) - break; + goto out_free; pg = pg->next; } return 0; + + out_free: + pg = stat->start; + while (pg) { + unsigned long tmp = (unsigned long)pg; + + pg = pg->next; + free_page(tmp); + } + + free_page((unsigned long)stat->pages); + stat->pages = NULL; + stat->start = NULL; + + return -ENOMEM; } static int ftrace_profile_init_cpu(int cpu) @@ -460,7 +486,7 @@ static int ftrace_profile_init_cpu(int cpu) ftrace_profile_bits++; } - /* Preallocate a few pages */ + /* Preallocate the function profiling pages */ if (ftrace_profile_pages_init(stat) < 0) { kfree(stat->hash); stat->hash = NULL; @@ -516,24 +542,21 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat, hlist_add_head_rcu(&rec->node, &stat->hash[key]); } -/* Interrupts must be disabled calling this */ +/* + * The memory is already allocated, this simply finds a new record to use. + */ static struct ftrace_profile * -ftrace_profile_alloc(struct ftrace_profile_stat *stat, - unsigned long ip, bool alloc_safe) +ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) { struct ftrace_profile *rec = NULL; - /* prevent recursion */ + /* prevent recursion (from NMIs) */ if (atomic_inc_return(&stat->disabled) != 1) goto out; - /* Try to always keep another page available */ - if (!stat->pages->next && alloc_safe) - stat->pages->next = (void *)get_zeroed_page(GFP_ATOMIC); - /* - * Try to find the function again since another - * task on another CPU could have added it + * Try to find the function again since an NMI + * could have added it */ rec = ftrace_find_profiled_func(stat, ip); if (rec) @@ -555,29 +578,16 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, return rec; } -/* - * If we are not in an interrupt, or softirq and - * and interrupts are disabled and preemption is not enabled - * (not in a spinlock) then it should be safe to allocate memory. - */ -static bool ftrace_safe_to_allocate(void) -{ - return !in_interrupt() && irqs_disabled() && !preempt_count(); -} - static void function_profile_call(unsigned long ip, unsigned long parent_ip) { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; unsigned long flags; - bool alloc_safe; if (!ftrace_profile_enabled) return; - alloc_safe = ftrace_safe_to_allocate(); - local_irq_save(flags); stat = &__get_cpu_var(ftrace_profile_stats); @@ -586,7 +596,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) rec = ftrace_find_profiled_func(stat, ip); if (!rec) { - rec = ftrace_profile_alloc(stat, ip, alloc_safe); + rec = ftrace_profile_alloc(stat, ip); if (!rec) goto out; } -- cgit v1.2.3 From 34886c8bc590f078d4c0b88f50d061326639198d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Mar 2009 21:00:47 -0400 Subject: tracing: add average time in function to function profiler Show the average time in the function (Time / Hit) Function Hit Time Avg -------- --- ---- --- mwait_idle 51 140326.6 us 2751.503 us smp_apic_timer_interrupt 47 3517.735 us 74.845 us schedule 10 2738.754 us 273.875 us __schedule 10 2732.857 us 273.285 us hrtimer_interrupt 47 1896.104 us 40.342 us irq_exit 56 1711.833 us 30.568 us __run_hrtimer 47 1315.589 us 27.991 us tick_sched_timer 47 1138.690 us 24.227 us do_softirq 56 1116.829 us 19.943 us __do_softirq 56 1066.932 us 19.052 us do_IRQ 9 926.153 us 102.905 us Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4d90c916b2b..c7f4a4be05d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -347,8 +347,10 @@ static int function_stat_cmp(void *p1, void *p2) static int function_stat_headers(struct seq_file *m) { #ifdef CONFIG_FUNCTION_GRAPH_TRACER - seq_printf(m, " Function Hit Time\n" - " -------- --- ----\n"); + seq_printf(m, " Function " + "Hit Time Avg\n" + " -------- " + "--- ---- ---\n"); #else seq_printf(m, " Function Hit\n" " -------- ---\n"); @@ -361,12 +363,9 @@ static int function_stat_show(struct seq_file *m, void *v) struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - static struct trace_seq s; static DEFINE_MUTEX(mutex); - - mutex_lock(&mutex); - trace_seq_init(&s); - trace_print_graph_duration(rec->time, &s); + static struct trace_seq s; + unsigned long long avg; #endif kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); @@ -374,6 +373,14 @@ static int function_stat_show(struct seq_file *m, void *v) #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_printf(m, " "); + avg = rec->time; + do_div(avg, rec->counter); + + mutex_lock(&mutex); + trace_seq_init(&s); + trace_print_graph_duration(rec->time, &s); + trace_seq_puts(&s, " "); + trace_print_graph_duration(avg, &s); trace_print_seq(m, &s); mutex_unlock(&mutex); #endif -- cgit v1.2.3 From 7fd7d83d49914f03aefffba6aee09032fcd54cce Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Feb 2009 23:24:03 -0800 Subject: x86/pvops: replace arch_enter_lazy_cpu_mode with arch_start_context_switch Impact: simplification, prepare for later changes Make lazy cpu mode more specific to context switching, so that it makes sense to do more context-switch specific things in the callbacks. Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5757e03cfac..7530fdd7c98 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2746,7 +2746,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * combine the page table reload and the switch backend into * one hypercall. */ - arch_enter_lazy_cpu_mode(); + arch_start_context_switch(); if (unlikely(!mm)) { next->active_mm = oldmm; -- cgit v1.2.3 From 224101ed69d3fbb486868e0f6e0f9fa37302efb4 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 18 Feb 2009 11:18:57 -0800 Subject: x86/paravirt: finish change from lazy cpu to context switch start/end Impact: fix lazy context switch API Pass the previous and next tasks into the context switch start end calls, so that the called functions can properly access the task state (esp in end_context_switch, in which the next task is not yet completely current). Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7530fdd7c98..133762aece5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2746,7 +2746,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * combine the page table reload and the switch backend into * one hypercall. */ - arch_start_context_switch(); + arch_start_context_switch(prev); if (unlikely(!mm)) { next->active_mm = oldmm; -- cgit v1.2.3 From 3d43321b7015387cfebbe26436d0e9d299162ea1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 2 Apr 2009 15:49:29 -0700 Subject: modules: sysctl to block module loading Implement a sysctl file that disables module-loading system-wide since there is no longer a viable way to remove CAP_SYS_MODULE after the system bounding capability set was removed in 2.6.25. Value can only be set to "1", and is tested only if standard capability checks allow CAP_SYS_MODULE. Given existing /dev/mem protections, this should allow administrators a one-way method to block module loading after initial boot-time module loading has finished. Signed-off-by: Kees Cook Acked-by: Serge Hallyn Signed-off-by: James Morris --- kernel/module.c | 7 +++++-- kernel/sysctl.c | 12 ++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f77ac320d0b..eeb3f7b1383 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -778,6 +778,9 @@ static void wait_for_zero_refcount(struct module *mod) mutex_lock(&module_mutex); } +/* Block module loading/unloading? */ +int modules_disabled = 0; + SYSCALL_DEFINE2(delete_module, const char __user *, name_user, unsigned int, flags) { @@ -785,7 +788,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, char name[MODULE_NAME_LEN]; int ret, forced = 0; - if (!capable(CAP_SYS_MODULE)) + if (!capable(CAP_SYS_MODULE) || modules_disabled) return -EPERM; if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) @@ -2349,7 +2352,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, int ret = 0; /* Must have permission */ - if (!capable(CAP_SYS_MODULE)) + if (!capable(CAP_SYS_MODULE) || modules_disabled) return -EPERM; /* Only one module load at a time, please */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c5ef44ff850..2fb4246d27d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -113,6 +113,7 @@ static int ngroups_max = NGROUPS_MAX; #ifdef CONFIG_MODULES extern char modprobe_path[]; +extern int modules_disabled; #endif #ifdef CONFIG_CHR_DEV_SG extern int sg_big_buff; @@ -533,6 +534,17 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dostring, .strategy = &sysctl_string, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "modules_disabled", + .data = &modules_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = &proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &one, + }, #endif #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) { -- cgit v1.2.3 From 755642322aa66fbc5421a35fd3e1733f73e20083 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:29 +0100 Subject: perf_counter: use list_move_tail() Instead of del/add use a move list-op. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b2e838959f3..0fe22c916e2 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -89,8 +89,7 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_for_each_entry_safe(sibling, tmp, &counter->sibling_list, list_entry) { - list_del_init(&sibling->list_entry); - list_add_tail(&sibling->list_entry, &ctx->counter_list); + list_move_tail(&sibling->list_entry, &ctx->counter_list); sibling->group_leader = sibling; } } @@ -959,8 +958,7 @@ static void rotate_ctx(struct perf_counter_context *ctx) */ perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - list_del(&counter->list_entry); - list_add_tail(&counter->list_entry, &ctx->counter_list); + list_move_tail(&counter->list_entry, &ctx->counter_list); break; } hw_perf_restore(perf_flags); -- cgit v1.2.3 From 15dbf27cc18559a14e99609f78678aa86b9c6ff1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:32 +0100 Subject: perf_counter: software counter event infrastructure Provide generic software counter infrastructure that supports software events. This will be used to allow sample based profiling based on software events such as pagefaults. The current infrastructure can only provide a count of such events, no place information. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 201 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0fe22c916e2..eeb1b46cf70 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1328,6 +1328,185 @@ static const struct file_operations perf_fops = { .compat_ioctl = perf_ioctl, }; +/* + * Generic software counter infrastructure + */ + +static void perf_swcounter_update(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + u64 prev, now; + s64 delta; + +again: + prev = atomic64_read(&hwc->prev_count); + now = atomic64_read(&hwc->count); + if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) + goto again; + + delta = now - prev; + + atomic64_add(delta, &counter->count); + atomic64_sub(delta, &hwc->period_left); +} + +static void perf_swcounter_set_period(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + s64 left = atomic64_read(&hwc->period_left); + s64 period = hwc->irq_period; + + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_add(period, &hwc->period_left); + } + + atomic64_set(&hwc->prev_count, -left); + atomic64_set(&hwc->count, -left); +} + +static void perf_swcounter_save_and_restart(struct perf_counter *counter) +{ + perf_swcounter_update(counter); + perf_swcounter_set_period(counter); +} + +static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data) +{ + struct perf_data *irqdata = counter->irqdata; + + if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { + irqdata->overrun++; + } else { + u64 *p = (u64 *) &irqdata->data[irqdata->len]; + + *p = data; + irqdata->len += sizeof(u64); + } +} + +static void perf_swcounter_handle_group(struct perf_counter *sibling) +{ + struct perf_counter *counter, *group_leader = sibling->group_leader; + + list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { + perf_swcounter_update(counter); + perf_swcounter_store_irq(sibling, counter->hw_event.type); + perf_swcounter_store_irq(sibling, atomic64_read(&counter->count)); + } +} + +static void perf_swcounter_interrupt(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + perf_swcounter_save_and_restart(counter); + + switch (counter->hw_event.record_type) { + case PERF_RECORD_SIMPLE: + break; + + case PERF_RECORD_IRQ: + perf_swcounter_store_irq(counter, instruction_pointer(regs)); + break; + + case PERF_RECORD_GROUP: + perf_swcounter_handle_group(counter); + break; + } + + if (nmi) { + counter->wakeup_pending = 1; + set_tsk_thread_flag(current, TIF_PERF_COUNTERS); + } else + wake_up(&counter->waitq); +} + +static int perf_swcounter_match(struct perf_counter *counter, + enum hw_event_types event, + struct pt_regs *regs) +{ + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + return 0; + + if (counter->hw_event.raw) + return 0; + + if (counter->hw_event.type != event) + return 0; + + if (counter->hw_event.exclude_user && user_mode(regs)) + return 0; + + if (counter->hw_event.exclude_kernel && !user_mode(regs)) + return 0; + + return 1; +} + +static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, + enum hw_event_types event, u64 nr, + int nmi, struct pt_regs *regs) +{ + struct perf_counter *counter; + unsigned long flags; + int neg; + + if (list_empty(&ctx->counter_list)) + return; + + spin_lock_irqsave(&ctx->lock, flags); + + /* + * XXX: make counter_list RCU safe + */ + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (perf_swcounter_match(counter, event, regs)) { + neg = atomic64_add_negative(nr, &counter->hw.count); + if (counter->hw.irq_period && !neg) + perf_swcounter_interrupt(counter, nmi, regs); + } + } + + spin_unlock_irqrestore(&ctx->lock, flags); +} + +void perf_swcounter_event(enum hw_event_types event, u64 nr, + int nmi, struct pt_regs *regs) +{ + struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + + perf_swcounter_ctx_event(&cpuctx->ctx, event, nr, nmi, regs); + if (cpuctx->task_ctx) + perf_swcounter_ctx_event(cpuctx->task_ctx, event, nr, nmi, regs); + + put_cpu_var(perf_cpu_context); +} + +static void perf_swcounter_read(struct perf_counter *counter) +{ + perf_swcounter_update(counter); +} + +static int perf_swcounter_enable(struct perf_counter *counter) +{ + perf_swcounter_set_period(counter); + return 0; +} + +static void perf_swcounter_disable(struct perf_counter *counter) +{ + perf_swcounter_update(counter); +} + +/* + * Software counter: cpu wall time clock + */ + static int cpu_clock_perf_counter_enable(struct perf_counter *counter) { int cpu = raw_smp_processor_id(); @@ -1364,6 +1543,10 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { .read = cpu_clock_perf_counter_read, }; +/* + * Software counter: task time clock + */ + /* * Called from within the scheduler: */ @@ -1420,6 +1603,10 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; +/* + * Software counter: page faults + */ + #ifdef CONFIG_VM_EVENT_COUNTERS #define cpu_page_faults() __get_cpu_var(vm_event_states).event[PGFAULT] #else @@ -1473,6 +1660,10 @@ static const struct hw_perf_counter_ops perf_ops_page_faults = { .read = page_faults_perf_counter_read, }; +/* + * Software counter: context switches + */ + static u64 get_context_switches(struct perf_counter *counter) { struct task_struct *curr = counter->ctx->task; @@ -1521,6 +1712,10 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = { .read = context_switches_perf_counter_read, }; +/* + * Software counter: cpu migrations + */ + static inline u64 get_cpu_migrations(struct perf_counter *counter) { struct task_struct *curr = counter->ctx->task; @@ -1572,7 +1767,9 @@ static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { + struct perf_counter_hw_event *hw_event = &counter->hw_event; const struct hw_perf_counter_ops *hw_ops = NULL; + struct hw_perf_counter *hwc = &counter->hw; /* * Software counters (currently) can't in general distinguish @@ -1618,6 +1815,10 @@ sw_perf_counter_init(struct perf_counter *counter) default: break; } + + if (hw_ops) + hwc->irq_period = hw_event->irq_period; + return hw_ops; } -- cgit v1.2.3 From 7dd1fcc258b65da718f01e4684a7b9244501a9fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:33 +0100 Subject: perf_counter: provide pagefault software events We use the generic software counter infrastructure to provide page fault events. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 53 +++------------------------------------------------ 1 file changed, 3 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index eeb1b46cf70..1773c5d7427 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1607,57 +1607,10 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { * Software counter: page faults */ -#ifdef CONFIG_VM_EVENT_COUNTERS -#define cpu_page_faults() __get_cpu_var(vm_event_states).event[PGFAULT] -#else -#define cpu_page_faults() 0 -#endif - -static u64 get_page_faults(struct perf_counter *counter) -{ - struct task_struct *curr = counter->ctx->task; - - if (curr) - return curr->maj_flt + curr->min_flt; - return cpu_page_faults(); -} - -static void page_faults_perf_counter_update(struct perf_counter *counter) -{ - u64 prev, now; - s64 delta; - - prev = atomic64_read(&counter->hw.prev_count); - now = get_page_faults(counter); - - atomic64_set(&counter->hw.prev_count, now); - - delta = now - prev; - - atomic64_add(delta, &counter->count); -} - -static void page_faults_perf_counter_read(struct perf_counter *counter) -{ - page_faults_perf_counter_update(counter); -} - -static int page_faults_perf_counter_enable(struct perf_counter *counter) -{ - if (counter->prev_state <= PERF_COUNTER_STATE_OFF) - atomic64_set(&counter->hw.prev_count, get_page_faults(counter)); - return 0; -} - -static void page_faults_perf_counter_disable(struct perf_counter *counter) -{ - page_faults_perf_counter_update(counter); -} - static const struct hw_perf_counter_ops perf_ops_page_faults = { - .enable = page_faults_perf_counter_enable, - .disable = page_faults_perf_counter_disable, - .read = page_faults_perf_counter_read, + .enable = perf_swcounter_enable, + .disable = perf_swcounter_disable, + .read = perf_swcounter_read, }; /* -- cgit v1.2.3 From ac17dc8e58f3069ea895cfff963adf98ff3cf6b2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:34 +0100 Subject: perf_counter: provide major/minor page fault software events Provide separate sw counters for major and minor page faults. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1773c5d7427..68950a3a52b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1503,6 +1503,12 @@ static void perf_swcounter_disable(struct perf_counter *counter) perf_swcounter_update(counter); } +static const struct hw_perf_counter_ops perf_ops_generic = { + .enable = perf_swcounter_enable, + .disable = perf_swcounter_disable, + .read = perf_swcounter_read, +}; + /* * Software counter: cpu wall time clock */ @@ -1603,16 +1609,6 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; -/* - * Software counter: page faults - */ - -static const struct hw_perf_counter_ops perf_ops_page_faults = { - .enable = perf_swcounter_enable, - .disable = perf_swcounter_disable, - .read = perf_swcounter_read, -}; - /* * Software counter: context switches */ @@ -1753,9 +1749,9 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_PAGE_FAULTS: - if (!(counter->hw_event.exclude_user || - counter->hw_event.exclude_kernel)) - hw_ops = &perf_ops_page_faults; + case PERF_COUNT_PAGE_FAULTS_MIN: + case PERF_COUNT_PAGE_FAULTS_MAJ: + hw_ops = &perf_ops_generic; break; case PERF_COUNT_CONTEXT_SWITCHES: if (!counter->hw_event.exclude_kernel) -- cgit v1.2.3 From d6d020e9957745c61285ef3da9f294c5e6801f0f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:35 +0100 Subject: perf_counter: hrtimer based sampling for software time events Use hrtimers to profile timer based sampling for the software time counters. This allows platforms without hardware counter support to still perform sample based profiling. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 123 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 86 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 68950a3a52b..f9330d5827c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1395,7 +1395,7 @@ static void perf_swcounter_handle_group(struct perf_counter *sibling) struct perf_counter *counter, *group_leader = sibling->group_leader; list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - perf_swcounter_update(counter); + counter->hw_ops->read(counter); perf_swcounter_store_irq(sibling, counter->hw_event.type); perf_swcounter_store_irq(sibling, atomic64_read(&counter->count)); } @@ -1404,8 +1404,6 @@ static void perf_swcounter_handle_group(struct perf_counter *sibling) static void perf_swcounter_interrupt(struct perf_counter *counter, int nmi, struct pt_regs *regs) { - perf_swcounter_save_and_restart(counter); - switch (counter->hw_event.record_type) { case PERF_RECORD_SIMPLE: break; @@ -1426,6 +1424,38 @@ static void perf_swcounter_interrupt(struct perf_counter *counter, wake_up(&counter->waitq); } +static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +{ + struct perf_counter *counter; + struct pt_regs *regs; + + counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); + counter->hw_ops->read(counter); + + regs = get_irq_regs(); + /* + * In case we exclude kernel IPs or are somehow not in interrupt + * context, provide the next best thing, the user IP. + */ + if ((counter->hw_event.exclude_kernel || !regs) && + !counter->hw_event.exclude_user) + regs = task_pt_regs(current); + + if (regs) + perf_swcounter_interrupt(counter, 0, regs); + + hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); + + return HRTIMER_RESTART; +} + +static void perf_swcounter_overflow(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + perf_swcounter_save_and_restart(counter); + perf_swcounter_interrupt(counter, nmi, regs); +} + static int perf_swcounter_match(struct perf_counter *counter, enum hw_event_types event, struct pt_regs *regs) @@ -1448,13 +1478,20 @@ static int perf_swcounter_match(struct perf_counter *counter, return 1; } +static void perf_swcounter_add(struct perf_counter *counter, u64 nr, + int nmi, struct pt_regs *regs) +{ + int neg = atomic64_add_negative(nr, &counter->hw.count); + if (counter->hw.irq_period && !neg) + perf_swcounter_overflow(counter, nmi, regs); +} + static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, enum hw_event_types event, u64 nr, int nmi, struct pt_regs *regs) { struct perf_counter *counter; unsigned long flags; - int neg; if (list_empty(&ctx->counter_list)) return; @@ -1465,11 +1502,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, * XXX: make counter_list RCU safe */ list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (perf_swcounter_match(counter, event, regs)) { - neg = atomic64_add_negative(nr, &counter->hw.count); - if (counter->hw.irq_period && !neg) - perf_swcounter_interrupt(counter, nmi, regs); - } + if (perf_swcounter_match(counter, event, regs)) + perf_swcounter_add(counter, nr, nmi, regs); } spin_unlock_irqrestore(&ctx->lock, flags); @@ -1513,14 +1547,6 @@ static const struct hw_perf_counter_ops perf_ops_generic = { * Software counter: cpu wall time clock */ -static int cpu_clock_perf_counter_enable(struct perf_counter *counter) -{ - int cpu = raw_smp_processor_id(); - - atomic64_set(&counter->hw.prev_count, cpu_clock(cpu)); - return 0; -} - static void cpu_clock_perf_counter_update(struct perf_counter *counter) { int cpu = raw_smp_processor_id(); @@ -1533,8 +1559,26 @@ static void cpu_clock_perf_counter_update(struct perf_counter *counter) atomic64_add(now - prev, &counter->count); } +static int cpu_clock_perf_counter_enable(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + int cpu = raw_smp_processor_id(); + + atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + if (hwc->irq_period) { + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swcounter_hrtimer; + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(hwc->irq_period), 0, + HRTIMER_MODE_REL, 0); + } + + return 0; +} + static void cpu_clock_perf_counter_disable(struct perf_counter *counter) { + hrtimer_cancel(&counter->hw.hrtimer); cpu_clock_perf_counter_update(counter); } @@ -1580,27 +1624,33 @@ static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now atomic64_add(delta, &counter->count); } -static void task_clock_perf_counter_read(struct perf_counter *counter) -{ - u64 now = task_clock_perf_counter_val(counter, 1); - - task_clock_perf_counter_update(counter, now); -} - static int task_clock_perf_counter_enable(struct perf_counter *counter) { - if (counter->prev_state <= PERF_COUNTER_STATE_OFF) - atomic64_set(&counter->hw.prev_count, - task_clock_perf_counter_val(counter, 0)); + struct hw_perf_counter *hwc = &counter->hw; + + atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0)); + if (hwc->irq_period) { + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swcounter_hrtimer; + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(hwc->irq_period), 0, + HRTIMER_MODE_REL, 0); + } return 0; } static void task_clock_perf_counter_disable(struct perf_counter *counter) { - u64 now = task_clock_perf_counter_val(counter, 0); + hrtimer_cancel(&counter->hw.hrtimer); + task_clock_perf_counter_update(counter, + task_clock_perf_counter_val(counter, 0)); +} - task_clock_perf_counter_update(counter, now); +static void task_clock_perf_counter_read(struct perf_counter *counter) +{ + task_clock_perf_counter_update(counter, + task_clock_perf_counter_val(counter, 1)); } static const struct hw_perf_counter_ops perf_ops_task_clock = { @@ -1729,16 +1779,12 @@ sw_perf_counter_init(struct perf_counter *counter) */ switch (counter->hw_event.type) { case PERF_COUNT_CPU_CLOCK: - if (!(counter->hw_event.exclude_user || - counter->hw_event.exclude_kernel || - counter->hw_event.exclude_hv)) - hw_ops = &perf_ops_cpu_clock; + hw_ops = &perf_ops_cpu_clock; + + if (hw_event->irq_period && hw_event->irq_period < 10000) + hw_event->irq_period = 10000; break; case PERF_COUNT_TASK_CLOCK: - if (counter->hw_event.exclude_user || - counter->hw_event.exclude_kernel || - counter->hw_event.exclude_hv) - break; /* * If the user instantiates this as a per-cpu counter, * use the cpu_clock counter instead. @@ -1747,6 +1793,9 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_task_clock; else hw_ops = &perf_ops_cpu_clock; + + if (hw_event->irq_period && hw_event->irq_period < 10000) + hw_event->irq_period = 10000; break; case PERF_COUNT_PAGE_FAULTS: case PERF_COUNT_PAGE_FAULTS_MIN: -- cgit v1.2.3 From 592903cdcbf606a838056bae6d03fc557806c914 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:36 +0100 Subject: perf_counter: add an event_list I noticed that the counter_list only includes top-level counters, thus perf_swcounter_event() will miss sw-counters in groups. Since perf_swcounter_event() also wants an RCU safe list, create a new event_list that includes all counters and uses RCU list ops and use call_rcu to free the counter structure. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f9330d5827c..8d6ecfa64c0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -22,6 +22,7 @@ #include #include #include +#include /* * Each CPU has a list of per CPU counters: @@ -72,6 +73,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_add_tail(&counter->list_entry, &ctx->counter_list); else list_add_tail(&counter->list_entry, &group_leader->sibling_list); + + list_add_rcu(&counter->event_entry, &ctx->event_list); } static void @@ -80,6 +83,7 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) struct perf_counter *sibling, *tmp; list_del_init(&counter->list_entry); + list_del_rcu(&counter->event_entry); /* * If this was a group counter with sibling counters then @@ -1133,6 +1137,14 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) return ctx; } +static void free_counter_rcu(struct rcu_head *head) +{ + struct perf_counter *counter; + + counter = container_of(head, struct perf_counter, rcu_head); + kfree(counter); +} + /* * Called when the last reference to the file is gone. */ @@ -1151,7 +1163,7 @@ static int perf_release(struct inode *inode, struct file *file) mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); - kfree(counter); + call_rcu(&counter->rcu_head, free_counter_rcu); put_context(ctx); return 0; @@ -1491,22 +1503,16 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, int nmi, struct pt_regs *regs) { struct perf_counter *counter; - unsigned long flags; - if (list_empty(&ctx->counter_list)) + if (list_empty(&ctx->event_list)) return; - spin_lock_irqsave(&ctx->lock, flags); - - /* - * XXX: make counter_list RCU safe - */ - list_for_each_entry(counter, &ctx->counter_list, list_entry) { + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { if (perf_swcounter_match(counter, event, regs)) perf_swcounter_add(counter, nr, nmi, regs); } - - spin_unlock_irqrestore(&ctx->lock, flags); + rcu_read_unlock(); } void perf_swcounter_event(enum hw_event_types event, u64 nr, @@ -1846,6 +1852,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, mutex_init(&counter->mutex); INIT_LIST_HEAD(&counter->list_entry); + INIT_LIST_HEAD(&counter->event_entry); INIT_LIST_HEAD(&counter->sibling_list); init_waitqueue_head(&counter->waitq); @@ -1992,6 +1999,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx, spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->counter_list); + INIT_LIST_HEAD(&ctx->event_list); ctx->task = task; } -- cgit v1.2.3 From 039fc91e064b81c2820ff16c304be5aba35fd126 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 16:43:47 +0100 Subject: perf_counter: fix hrtimer sampling Impact: fix deadlock with perfstat Fix for the perfstat fubar.. We cannot unconditionally call hrtimer_cancel() without ever having done hrtimer_init() on the thing. Signed-off-by: Peter Zijlstra Orig-LKML-Reference: <1236959027.22447.149.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8d6ecfa64c0..d6cc22271ef 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1571,9 +1571,9 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) int cpu = raw_smp_processor_id(); atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swcounter_hrtimer; if (hwc->irq_period) { - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(hwc->irq_period), 0, HRTIMER_MODE_REL, 0); @@ -1635,9 +1635,9 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0)); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swcounter_hrtimer; if (hwc->irq_period) { - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(hwc->irq_period), 0, HRTIMER_MODE_REL, 0); -- cgit v1.2.3 From 4e193bd4dfdc983d12969b51439b4a1fbaf2daad Mon Sep 17 00:00:00 2001 From: Tim Blechmann Date: Sat, 14 Mar 2009 14:29:25 +0100 Subject: perf_counter: include missing header Impact: build fix In order to compile a kernel with performance counter patches, has to be included to provide the declaration of struct pt_regs *get_irq_regs(void); [ This bug was masked by unrelated x86 header file changes in the x86 tree, but occurs in the tip:perfcounters/core standalone tree. ] Signed-off-by: Tim Blechmann Orig-LKML-Reference: <20090314142925.49c29c17@thinkpad> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d6cc22271ef..0018c5e8124 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -24,6 +24,8 @@ #include #include +#include + /* * Each CPU has a list of per CPU counters: */ -- cgit v1.2.3 From b6c5a71da1477d261bc36254fe1f20d32b57598d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 16 Mar 2009 21:00:00 +1100 Subject: perf_counter: abstract wakeup flag setting in core to fix powerpc build Impact: build fix for powerpc Commit bd753921015e7905 ("perf_counter: software counter event infrastructure") introduced a use of TIF_PERF_COUNTERS into the core perfcounter code. This breaks the build on powerpc because we use a flag in a per-cpu area to signal wakeups on powerpc rather than a thread_info flag, because the thread_info flags have to be manipulated with atomic operations and are thus slower than per-cpu flags. This fixes the by changing the core to use an abstracted set_perf_counter_pending() function, which is defined on x86 to set the TIF_PERF_COUNTERS flag and on powerpc to set the per-cpu flag (paca->perf_counter_pending). It changes the previous powerpc definition of set_perf_counter_pending to not take an argument and adds a clear_perf_counter_pending, so as to simplify the definition on x86. On x86, set_perf_counter_pending() is defined as a macro. Defining it as a static inline in arch/x86/include/asm/perf_counters.h causes compile failures because gets included early in , and the definitions of set_tsk_thread_flag etc. are therefore not available in . (On powerpc this problem is avoided by defining set_perf_counter_pending etc. in .) Signed-off-by: Paul Mackerras --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0018c5e8124..b39456ad74a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1433,7 +1433,7 @@ static void perf_swcounter_interrupt(struct perf_counter *counter, if (nmi) { counter->wakeup_pending = 1; - set_tsk_thread_flag(current, TIF_PERF_COUNTERS); + set_perf_counter_pending(); } else wake_up(&counter->waitq); } -- cgit v1.2.3 From 01ef09d9ffb5ce9f8d62d1e5206da3d5ca612acc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:11 +0100 Subject: perf_counter: fix uninitialized usage of event_list Impact: fix boot crash When doing the generic context switch event I ran into some early boot hangs, which were caused by inf func recursion (event, fault, event, fault). I eventually tracked it down to event_list not being initialized at the time of the first event. Fix this. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.195392657@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b39456ad74a..4c4e9eb37ab 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1506,7 +1506,7 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, { struct perf_counter *counter; - if (list_empty(&ctx->event_list)) + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) return; rcu_read_lock(); -- cgit v1.2.3 From 4a0deca657f3dbb8a707b5dc8f173beec01e7ed2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:12 +0100 Subject: perf_counter: generic context switch event Impact: cleanup Use the generic software events for context switches. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.283522645@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 60 ++++----------------------------------------------- kernel/sched.c | 6 ------ 2 files changed, 4 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4c4e9eb37ab..99d5930f0a5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -710,10 +710,13 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct pt_regs *regs; if (likely(!cpuctx->task_ctx)) return; + regs = task_pt_regs(task); + perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs); __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; @@ -1667,58 +1670,6 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; -/* - * Software counter: context switches - */ - -static u64 get_context_switches(struct perf_counter *counter) -{ - struct task_struct *curr = counter->ctx->task; - - if (curr) - return curr->nvcsw + curr->nivcsw; - return cpu_nr_switches(smp_processor_id()); -} - -static void context_switches_perf_counter_update(struct perf_counter *counter) -{ - u64 prev, now; - s64 delta; - - prev = atomic64_read(&counter->hw.prev_count); - now = get_context_switches(counter); - - atomic64_set(&counter->hw.prev_count, now); - - delta = now - prev; - - atomic64_add(delta, &counter->count); -} - -static void context_switches_perf_counter_read(struct perf_counter *counter) -{ - context_switches_perf_counter_update(counter); -} - -static int context_switches_perf_counter_enable(struct perf_counter *counter) -{ - if (counter->prev_state <= PERF_COUNTER_STATE_OFF) - atomic64_set(&counter->hw.prev_count, - get_context_switches(counter)); - return 0; -} - -static void context_switches_perf_counter_disable(struct perf_counter *counter) -{ - context_switches_perf_counter_update(counter); -} - -static const struct hw_perf_counter_ops perf_ops_context_switches = { - .enable = context_switches_perf_counter_enable, - .disable = context_switches_perf_counter_disable, - .read = context_switches_perf_counter_read, -}; - /* * Software counter: cpu migrations */ @@ -1808,11 +1759,8 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_PAGE_FAULTS: case PERF_COUNT_PAGE_FAULTS_MIN: case PERF_COUNT_PAGE_FAULTS_MAJ: - hw_ops = &perf_ops_generic; - break; case PERF_COUNT_CONTEXT_SWITCHES: - if (!counter->hw_event.exclude_kernel) - hw_ops = &perf_ops_context_switches; + hw_ops = &perf_ops_generic; break; case PERF_COUNT_CPU_MIGRATIONS: if (!counter->hw_event.exclude_kernel) diff --git a/kernel/sched.c b/kernel/sched.c index 39e70860216..f76e3c0188a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2900,14 +2900,8 @@ unsigned long nr_active(void) /* * Externally visible per-cpu scheduler statistics: - * cpu_nr_switches(cpu) - number of context switches on that cpu * cpu_nr_migrations(cpu) - number of migrations into that cpu */ -u64 cpu_nr_switches(int cpu) -{ - return cpu_rq(cpu)->nr_switches; -} - u64 cpu_nr_migrations(int cpu) { return cpu_rq(cpu)->nr_migrations_in; -- cgit v1.2.3 From f16009527595ee562308653bc3d0039166d2ab15 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:16 +0100 Subject: perf_counter: fix up counter free paths Impact: fix crash during perfcounters use I found another counter free path, create a free_counter() call to accomodate generic tear-down. Fixes an RCU bug. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.652078652@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 99d5930f0a5..97f891ffeb4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1150,6 +1150,11 @@ static void free_counter_rcu(struct rcu_head *head) kfree(counter); } +static void free_counter(struct perf_counter *counter) +{ + call_rcu(&counter->rcu_head, free_counter_rcu); +} + /* * Called when the last reference to the file is gone. */ @@ -1168,7 +1173,7 @@ static int perf_release(struct inode *inode, struct file *file) mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); - call_rcu(&counter->rcu_head, free_counter_rcu); + free_counter(counter); put_context(ctx); return 0; @@ -2128,10 +2133,10 @@ __perf_counter_exit_task(struct task_struct *child, list_entry) { if (sub->parent) { sync_child_counter(sub, sub->parent); - kfree(sub); + free_counter(sub); } } - kfree(child_counter); + free_counter(child_counter); } } -- cgit v1.2.3 From e077df4f439681e43f0db8255b2d215b342ebdc6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:17 +0100 Subject: perf_counter: hook up the tracepoint events Impact: new perfcounters feature Enable usage of tracepoints as perf counter events. tracepoint event ids can be found in /debug/tracing/event/*/*/id and (for now) are represented as -65536+id in the type field. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.744044174@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 97f891ffeb4..0bbe3e45ba0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1152,6 +1152,9 @@ static void free_counter_rcu(struct rcu_head *head) static void free_counter(struct perf_counter *counter) { + if (counter->destroy) + counter->destroy(counter); + call_rcu(&counter->rcu_head, free_counter_rcu); } @@ -1727,6 +1730,45 @@ static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { .read = cpu_migrations_perf_counter_read, }; +#ifdef CONFIG_EVENT_PROFILE +void perf_tpcounter_event(int event_id) +{ + perf_swcounter_event(PERF_TP_EVENTS_MIN + event_id, 1, 1, + task_pt_regs(current)); +} + +extern int ftrace_profile_enable(int); +extern void ftrace_profile_disable(int); + +static void tp_perf_counter_destroy(struct perf_counter *counter) +{ + int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN; + + ftrace_profile_disable(event_id); +} + +static const struct hw_perf_counter_ops * +tp_perf_counter_init(struct perf_counter *counter) +{ + int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN; + int ret; + + ret = ftrace_profile_enable(event_id); + if (ret) + return NULL; + + counter->destroy = tp_perf_counter_destroy; + + return &perf_ops_generic; +} +#else +static const struct hw_perf_counter_ops * +tp_perf_counter_init(struct perf_counter *counter) +{ + return NULL; +} +#endif + static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { @@ -1772,6 +1814,7 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_cpu_migrations; break; default: + hw_ops = tp_perf_counter_init(counter); break; } -- cgit v1.2.3 From b8e83514b64577b48bfb794fe85fcde40a9343ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:18 +0100 Subject: perf_counter: revamp syscall input ABI Impact: modify ABI The hardware/software classification in hw_event->type became a little strained due to the addition of tracepoint tracing. Instead split up the field and provide a type field to explicitly specify the counter type, while using the event_id field to specify which event to use. Raw counters still work as before, only the raw config now goes into raw_event. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.836807573@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 83 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0bbe3e45ba0..68a56a68bc7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1395,12 +1395,6 @@ static void perf_swcounter_set_period(struct perf_counter *counter) atomic64_set(&hwc->count, -left); } -static void perf_swcounter_save_and_restart(struct perf_counter *counter) -{ - perf_swcounter_update(counter); - perf_swcounter_set_period(counter); -} - static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data) { struct perf_data *irqdata = counter->irqdata; @@ -1421,7 +1415,7 @@ static void perf_swcounter_handle_group(struct perf_counter *sibling) list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { counter->hw_ops->read(counter); - perf_swcounter_store_irq(sibling, counter->hw_event.type); + perf_swcounter_store_irq(sibling, counter->hw_event.event_config); perf_swcounter_store_irq(sibling, atomic64_read(&counter->count)); } } @@ -1477,21 +1471,25 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) static void perf_swcounter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs) { - perf_swcounter_save_and_restart(counter); + perf_swcounter_update(counter); + perf_swcounter_set_period(counter); perf_swcounter_interrupt(counter, nmi, regs); } static int perf_swcounter_match(struct perf_counter *counter, - enum hw_event_types event, - struct pt_regs *regs) + enum perf_event_types type, + u32 event, struct pt_regs *regs) { if (counter->state != PERF_COUNTER_STATE_ACTIVE) return 0; - if (counter->hw_event.raw) + if (counter->hw_event.raw_type) + return 0; + + if (counter->hw_event.type != type) return 0; - if (counter->hw_event.type != event) + if (counter->hw_event.event_id != event) return 0; if (counter->hw_event.exclude_user && user_mode(regs)) @@ -1512,8 +1510,8 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, } static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, - enum hw_event_types event, u64 nr, - int nmi, struct pt_regs *regs) + enum perf_event_types type, u32 event, + u64 nr, int nmi, struct pt_regs *regs) { struct perf_counter *counter; @@ -1522,24 +1520,31 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_swcounter_match(counter, event, regs)) + if (perf_swcounter_match(counter, type, event, regs)) perf_swcounter_add(counter, nr, nmi, regs); } rcu_read_unlock(); } -void perf_swcounter_event(enum hw_event_types event, u64 nr, - int nmi, struct pt_regs *regs) +static void __perf_swcounter_event(enum perf_event_types type, u32 event, + u64 nr, int nmi, struct pt_regs *regs) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); - perf_swcounter_ctx_event(&cpuctx->ctx, event, nr, nmi, regs); - if (cpuctx->task_ctx) - perf_swcounter_ctx_event(cpuctx->task_ctx, event, nr, nmi, regs); + perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs); + if (cpuctx->task_ctx) { + perf_swcounter_ctx_event(cpuctx->task_ctx, type, event, + nr, nmi, regs); + } put_cpu_var(perf_cpu_context); } +void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) +{ + __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs); +} + static void perf_swcounter_read(struct perf_counter *counter) { perf_swcounter_update(counter); @@ -1733,8 +1738,12 @@ static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { #ifdef CONFIG_EVENT_PROFILE void perf_tpcounter_event(int event_id) { - perf_swcounter_event(PERF_TP_EVENTS_MIN + event_id, 1, 1, - task_pt_regs(current)); + struct pt_regs *regs = get_irq_regs(); + + if (!regs) + regs = task_pt_regs(current); + + __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs); } extern int ftrace_profile_enable(int); @@ -1742,15 +1751,13 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN; - - ftrace_profile_disable(event_id); + ftrace_profile_disable(counter->hw_event.event_id); } static const struct hw_perf_counter_ops * tp_perf_counter_init(struct perf_counter *counter) { - int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN; + int event_id = counter->hw_event.event_id; int ret; ret = ftrace_profile_enable(event_id); @@ -1758,6 +1765,7 @@ tp_perf_counter_init(struct perf_counter *counter) return NULL; counter->destroy = tp_perf_counter_destroy; + counter->hw.irq_period = counter->hw_event.irq_period; return &perf_ops_generic; } @@ -1783,7 +1791,7 @@ sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (counter->hw_event.type) { + switch (counter->hw_event.event_id) { case PERF_COUNT_CPU_CLOCK: hw_ops = &perf_ops_cpu_clock; @@ -1813,9 +1821,6 @@ sw_perf_counter_init(struct perf_counter *counter) if (!counter->hw_event.exclude_kernel) hw_ops = &perf_ops_cpu_migrations; break; - default: - hw_ops = tp_perf_counter_init(counter); - break; } if (hw_ops) @@ -1870,10 +1875,22 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->state = PERF_COUNTER_STATE_OFF; hw_ops = NULL; - if (!hw_event->raw && hw_event->type < 0) - hw_ops = sw_perf_counter_init(counter); - else + + if (hw_event->raw_type) + hw_ops = hw_perf_counter_init(counter); + else switch (hw_event->type) { + case PERF_TYPE_HARDWARE: hw_ops = hw_perf_counter_init(counter); + break; + + case PERF_TYPE_SOFTWARE: + hw_ops = sw_perf_counter_init(counter); + break; + + case PERF_TYPE_TRACEPOINT: + hw_ops = tp_perf_counter_init(counter); + break; + } if (!hw_ops) { kfree(counter); -- cgit v1.2.3 From 0322cd6ec504b0bf08ca7b2c3d7f43bda37d79c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:19 +0100 Subject: perf_counter: unify irq output code Impact: cleanup Having 3 slightly different copies of the same code around does nobody any good. First step in revamping the output format. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.929962222@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 106 ++++++++++++++++++++++++++------------------------ 1 file changed, 56 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 68a56a68bc7..f054b8c9bf9 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1353,6 +1353,60 @@ static const struct file_operations perf_fops = { .compat_ioctl = perf_ioctl, }; +/* + * Output + */ + +static void perf_counter_store_irq(struct perf_counter *counter, u64 data) +{ + struct perf_data *irqdata = counter->irqdata; + + if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { + irqdata->overrun++; + } else { + u64 *p = (u64 *) &irqdata->data[irqdata->len]; + + *p = data; + irqdata->len += sizeof(u64); + } +} + +static void perf_counter_handle_group(struct perf_counter *counter) +{ + struct perf_counter *leader, *sub; + + leader = counter->group_leader; + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + if (sub != counter) + sub->hw_ops->read(sub); + perf_counter_store_irq(counter, sub->hw_event.event_config); + perf_counter_store_irq(counter, atomic64_read(&sub->count)); + } +} + +void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + switch (counter->hw_event.record_type) { + case PERF_RECORD_SIMPLE: + return; + + case PERF_RECORD_IRQ: + perf_counter_store_irq(counter, instruction_pointer(regs)); + break; + + case PERF_RECORD_GROUP: + perf_counter_handle_group(counter); + break; + } + + if (nmi) { + counter->wakeup_pending = 1; + set_perf_counter_pending(); + } else + wake_up(&counter->waitq); +} + /* * Generic software counter infrastructure */ @@ -1395,54 +1449,6 @@ static void perf_swcounter_set_period(struct perf_counter *counter) atomic64_set(&hwc->count, -left); } -static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data) -{ - struct perf_data *irqdata = counter->irqdata; - - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; - - *p = data; - irqdata->len += sizeof(u64); - } -} - -static void perf_swcounter_handle_group(struct perf_counter *sibling) -{ - struct perf_counter *counter, *group_leader = sibling->group_leader; - - list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - counter->hw_ops->read(counter); - perf_swcounter_store_irq(sibling, counter->hw_event.event_config); - perf_swcounter_store_irq(sibling, atomic64_read(&counter->count)); - } -} - -static void perf_swcounter_interrupt(struct perf_counter *counter, - int nmi, struct pt_regs *regs) -{ - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - break; - - case PERF_RECORD_IRQ: - perf_swcounter_store_irq(counter, instruction_pointer(regs)); - break; - - case PERF_RECORD_GROUP: - perf_swcounter_handle_group(counter); - break; - } - - if (nmi) { - counter->wakeup_pending = 1; - set_perf_counter_pending(); - } else - wake_up(&counter->waitq); -} - static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) { struct perf_counter *counter; @@ -1461,7 +1467,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) regs = task_pt_regs(current); if (regs) - perf_swcounter_interrupt(counter, 0, regs); + perf_counter_output(counter, 0, regs); hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); @@ -1473,7 +1479,7 @@ static void perf_swcounter_overflow(struct perf_counter *counter, { perf_swcounter_update(counter); perf_swcounter_set_period(counter); - perf_swcounter_interrupt(counter, nmi, regs); + perf_counter_output(counter, nmi, regs); } static int perf_swcounter_match(struct perf_counter *counter, -- cgit v1.2.3 From f4a2deb4860497f4332cf6a1acddab3dd628ddf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:06 +0100 Subject: perf_counter: remove the event config bitfields Since the bitfields turned into a bit of a mess, remove them and rely on good old masks. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.059499915@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f054b8c9bf9..ca14fc41ccd 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1379,7 +1379,7 @@ static void perf_counter_handle_group(struct perf_counter *counter) list_for_each_entry(sub, &leader->sibling_list, list_entry) { if (sub != counter) sub->hw_ops->read(sub); - perf_counter_store_irq(counter, sub->hw_event.event_config); + perf_counter_store_irq(counter, sub->hw_event.config); perf_counter_store_irq(counter, atomic64_read(&sub->count)); } } @@ -1489,13 +1489,13 @@ static int perf_swcounter_match(struct perf_counter *counter, if (counter->state != PERF_COUNTER_STATE_ACTIVE) return 0; - if (counter->hw_event.raw_type) + if (perf_event_raw(&counter->hw_event)) return 0; - if (counter->hw_event.type != type) + if (perf_event_type(&counter->hw_event) != type) return 0; - if (counter->hw_event.event_id != event) + if (perf_event_id(&counter->hw_event) != event) return 0; if (counter->hw_event.exclude_user && user_mode(regs)) @@ -1757,13 +1757,13 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - ftrace_profile_disable(counter->hw_event.event_id); + ftrace_profile_disable(perf_event_id(&counter->hw_event)); } static const struct hw_perf_counter_ops * tp_perf_counter_init(struct perf_counter *counter) { - int event_id = counter->hw_event.event_id; + int event_id = perf_event_id(&counter->hw_event); int ret; ret = ftrace_profile_enable(event_id); @@ -1797,7 +1797,7 @@ sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (counter->hw_event.event_id) { + switch (perf_event_id(&counter->hw_event)) { case PERF_COUNT_CPU_CLOCK: hw_ops = &perf_ops_cpu_clock; @@ -1882,9 +1882,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, hw_ops = NULL; - if (hw_event->raw_type) + if (perf_event_raw(hw_event)) { hw_ops = hw_perf_counter_init(counter); - else switch (hw_event->type) { + goto done; + } + + switch (perf_event_type(hw_event)) { case PERF_TYPE_HARDWARE: hw_ops = hw_perf_counter_init(counter); break; @@ -1902,6 +1905,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, kfree(counter); return NULL; } +done: counter->hw_ops = hw_ops; return counter; -- cgit v1.2.3 From 96f6d4444302bb2ea2cf409529eef816462f6ce0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:07 +0100 Subject: perf_counter: avoid recursion Tracepoint events like lock_acquire and software counters like pagefaults can recurse into the perf counter code again, avoid that. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.152096433@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ca14fc41ccd..ce34bff07bd 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -1532,10 +1533,31 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_unlock(); } +static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) +{ + if (in_nmi()) + return &cpuctx->recursion[3]; + + if (in_irq()) + return &cpuctx->recursion[2]; + + if (in_softirq()) + return &cpuctx->recursion[1]; + + return &cpuctx->recursion[0]; +} + static void __perf_swcounter_event(enum perf_event_types type, u32 event, u64 nr, int nmi, struct pt_regs *regs) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + int *recursion = perf_swcounter_recursion_context(cpuctx); + + if (*recursion) + goto out; + + (*recursion)++; + barrier(); perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs); if (cpuctx->task_ctx) { @@ -1543,6 +1565,10 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event, nr, nmi, regs); } + barrier(); + (*recursion)--; + +out: put_cpu_var(perf_cpu_context); } -- cgit v1.2.3 From 37d81828385f8ff823caaaf1a83e72d065b6cfa1 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 23 Mar 2009 18:22:08 +0100 Subject: perf_counter: add an mmap method to allow userspace to read hardware counters Impact: new feature giving performance improvement This adds the ability for userspace to do an mmap on a hardware counter fd and get access to a read-only page that contains the information needed to translate a hardware counter value to the full 64-bit counter value that would be returned by a read on the fd. This is useful on architectures that allow user programs to read the hardware counters, such as PowerPC. The mmap will only succeed if the counter is a hardware counter monitoring the current process. On my quad 2.5GHz PowerPC 970MP machine, userspace can read a counter and translate it to the full 64-bit value in about 30ns using the mmapped page, compared to about 830ns for the read syscall on the counter, so this does give a significant performance improvement. Signed-off-by: Paul Mackerras Signed-off-by: Peter Zijlstra Orig-LKML-Reference: <20090323172417.297057964@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ce34bff07bd..d9cfd902140 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1177,6 +1177,7 @@ static int perf_release(struct inode *inode, struct file *file) mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); + free_page(counter->user_page); free_counter(counter); put_context(ctx); @@ -1346,12 +1347,87 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } +void perf_counter_update_userpage(struct perf_counter *counter) +{ + struct perf_counter_mmap_page *userpg; + + if (!counter->user_page) + return; + userpg = (struct perf_counter_mmap_page *) counter->user_page; + + ++userpg->lock; + smp_wmb(); + userpg->index = counter->hw.idx; + userpg->offset = atomic64_read(&counter->count); + if (counter->state == PERF_COUNTER_STATE_ACTIVE) + userpg->offset -= atomic64_read(&counter->hw.prev_count); + smp_wmb(); + ++userpg->lock; +} + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct perf_counter *counter = vma->vm_file->private_data; + + if (!counter->user_page) + return VM_FAULT_SIGBUS; + + vmf->page = virt_to_page(counter->user_page); + get_page(vmf->page); + return 0; +} + +static struct vm_operations_struct perf_mmap_vmops = { + .fault = perf_mmap_fault, +}; + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_counter *counter = file->private_data; + unsigned long userpg; + + if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) + return -EINVAL; + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + /* + * For now, restrict to the case of a hardware counter + * on the current task. + */ + if (is_software_counter(counter) || counter->task != current) + return -EINVAL; + + userpg = counter->user_page; + if (!userpg) { + userpg = get_zeroed_page(GFP_KERNEL); + mutex_lock(&counter->mutex); + if (counter->user_page) { + free_page(userpg); + userpg = counter->user_page; + } else { + counter->user_page = userpg; + } + mutex_unlock(&counter->mutex); + if (!userpg) + return -ENOMEM; + } + + perf_counter_update_userpage(counter); + + vma->vm_flags &= ~VM_MAYWRITE; + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &perf_mmap_vmops; + return 0; +} + static const struct file_operations perf_fops = { .release = perf_release, .read = perf_read, .poll = perf_poll, .unlocked_ioctl = perf_ioctl, .compat_ioctl = perf_ioctl, + .mmap = perf_mmap, }; /* -- cgit v1.2.3 From b09d2501ed3d294619cbfbcf828ad39324d0e548 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 1 Apr 2009 17:21:56 -0700 Subject: mutex: drop "inline" from mutex_lock() inside kernel/mutex.c Impact: build fix mutex_lock() is was defined inline in kernel/mutex.c, but wasn't declared so not in . This didn't cause a problem until checkin 3a2d367d9aabac486ac4444c6c7ec7a1dab16267 added the atomic_dec_and_mutex_lock() inline in between declaration and definion. This broke building with CONFIG_ALLOW_WARNINGS=n, e.g. make allnoconfig. Either from the source code nor the allnoconfig binary output I cannot find any internal references to mutex_lock() in kernel/mutex.c, so presumably this "inline" is now-useless legacy. Cc: Eric Paris Cc: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: Signed-off-by: H. Peter Anvin --- kernel/mutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index 5d79781394a..fd95eaa672e 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count); * * This function is similar to (but not equivalent to) down(). */ -void inline __sched mutex_lock(struct mutex *lock) +void __sched mutex_lock(struct mutex *lock) { might_sleep(); /* -- cgit v1.2.3 From 7b732a75047738e4f85438ed2f9cd34bf5f2a19a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:10 +0100 Subject: perf_counter: new output ABI - part 1 Impact: Rework the perfcounter output ABI use sys_read() only for instant data and provide mmap() output for all async overflow data. The first mmap() determines the size of the output buffer. The mmap() size must be a PAGE_SIZE multiple of 1+pages, where pages must be a power of 2 or 0. Further mmap()s of the same fd must have the same size. Once all maps are gone, you can again mmap() with a new size. In case of 0 extra pages there is no data output and the first page only contains meta data. When there are data pages, a poll() event will be generated for each full page of data. Furthermore, the output is circular. This means that although 1 page is a valid configuration, its useless, since we'll start overwriting it the instant we report a full page. Future work will focus on the output format (currently maintained) where we'll likey want each entry denoted by a header which includes a type and length. Further future work will allow to splice() the fd, also containing the async overflow data -- splice() would be mutually exclusive with mmap() of the data. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.470536358@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 464 ++++++++++++++++++++++++++------------------------ 1 file changed, 245 insertions(+), 219 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d9cfd902140..0dfe91094fd 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -4,7 +4,8 @@ * Copyright(C) 2008 Thomas Gleixner * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar * - * For licencing details see kernel-base/COPYING + * + * For licensing details see kernel-base/COPYING */ #include @@ -1022,66 +1023,6 @@ static u64 perf_counter_read(struct perf_counter *counter) return atomic64_read(&counter->count); } -/* - * Cross CPU call to switch performance data pointers - */ -static void __perf_switch_irq_data(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - struct perf_data *oldirqdata = counter->irqdata; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - */ - if (ctx->task) { - if (cpuctx->task_ctx != ctx) - return; - spin_lock(&ctx->lock); - } - - /* Change the pointer NMI safe */ - atomic_long_set((atomic_long_t *)&counter->irqdata, - (unsigned long) counter->usrdata); - counter->usrdata = oldirqdata; - - if (ctx->task) - spin_unlock(&ctx->lock); -} - -static struct perf_data *perf_switch_irq_data(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct perf_data *oldirqdata = counter->irqdata; - struct task_struct *task = ctx->task; - - if (!task) { - smp_call_function_single(counter->cpu, - __perf_switch_irq_data, - counter, 1); - return counter->usrdata; - } - -retry: - spin_lock_irq(&ctx->lock); - if (counter->state != PERF_COUNTER_STATE_ACTIVE) { - counter->irqdata = counter->usrdata; - counter->usrdata = oldirqdata; - spin_unlock_irq(&ctx->lock); - return oldirqdata; - } - spin_unlock_irq(&ctx->lock); - task_oncpu_function_call(task, __perf_switch_irq_data, counter); - /* Might have failed, because task was scheduled out */ - if (counter->irqdata == oldirqdata) - goto retry; - - return counter->usrdata; -} - static void put_context(struct perf_counter_context *ctx) { if (ctx->task) @@ -1177,7 +1118,6 @@ static int perf_release(struct inode *inode, struct file *file) mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); - free_page(counter->user_page); free_counter(counter); put_context(ctx); @@ -1192,7 +1132,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) { u64 cntval; - if (count != sizeof(cntval)) + if (count < sizeof(cntval)) return -EINVAL; /* @@ -1210,122 +1150,21 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval); } -static ssize_t -perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count) -{ - if (!usrdata->len) - return 0; - - count = min(count, (size_t)usrdata->len); - if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count)) - return -EFAULT; - - /* Adjust the counters */ - usrdata->len -= count; - if (!usrdata->len) - usrdata->rd_idx = 0; - else - usrdata->rd_idx += count; - - return count; -} - -static ssize_t -perf_read_irq_data(struct perf_counter *counter, - char __user *buf, - size_t count, - int nonblocking) -{ - struct perf_data *irqdata, *usrdata; - DECLARE_WAITQUEUE(wait, current); - ssize_t res, res2; - - irqdata = counter->irqdata; - usrdata = counter->usrdata; - - if (usrdata->len + irqdata->len >= count) - goto read_pending; - - if (nonblocking) - return -EAGAIN; - - spin_lock_irq(&counter->waitq.lock); - __add_wait_queue(&counter->waitq, &wait); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (usrdata->len + irqdata->len >= count) - break; - - if (signal_pending(current)) - break; - - if (counter->state == PERF_COUNTER_STATE_ERROR) - break; - - spin_unlock_irq(&counter->waitq.lock); - schedule(); - spin_lock_irq(&counter->waitq.lock); - } - __remove_wait_queue(&counter->waitq, &wait); - __set_current_state(TASK_RUNNING); - spin_unlock_irq(&counter->waitq.lock); - - if (usrdata->len + irqdata->len < count && - counter->state != PERF_COUNTER_STATE_ERROR) - return -ERESTARTSYS; -read_pending: - mutex_lock(&counter->mutex); - - /* Drain pending data first: */ - res = perf_copy_usrdata(usrdata, buf, count); - if (res < 0 || res == count) - goto out; - - /* Switch irq buffer: */ - usrdata = perf_switch_irq_data(counter); - res2 = perf_copy_usrdata(usrdata, buf + res, count - res); - if (res2 < 0) { - if (!res) - res = -EFAULT; - } else { - res += res2; - } -out: - mutex_unlock(&counter->mutex); - - return res; -} - static ssize_t perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_counter *counter = file->private_data; - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - return perf_read_hw(counter, buf, count); - - case PERF_RECORD_IRQ: - case PERF_RECORD_GROUP: - return perf_read_irq_data(counter, buf, count, - file->f_flags & O_NONBLOCK); - } - return -EINVAL; + return perf_read_hw(counter, buf, count); } static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_counter *counter = file->private_data; - unsigned int events = 0; - unsigned long flags; + unsigned int events = POLLIN; poll_wait(file, &counter->waitq, wait); - spin_lock_irqsave(&counter->waitq.lock, flags); - if (counter->usrdata->len || counter->irqdata->len) - events |= POLLIN; - spin_unlock_irqrestore(&counter->waitq.lock, flags); - return events; } @@ -1347,78 +1186,207 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } -void perf_counter_update_userpage(struct perf_counter *counter) +static void __perf_counter_update_userpage(struct perf_counter *counter, + struct perf_mmap_data *data) { - struct perf_counter_mmap_page *userpg; - - if (!counter->user_page) - return; - userpg = (struct perf_counter_mmap_page *) counter->user_page; + struct perf_counter_mmap_page *userpg = data->user_page; + /* + * Disable preemption so as to not let the corresponding user-space + * spin too long if we get preempted. + */ + preempt_disable(); ++userpg->lock; smp_wmb(); userpg->index = counter->hw.idx; userpg->offset = atomic64_read(&counter->count); if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); + + userpg->data_head = atomic_read(&data->head); smp_wmb(); ++userpg->lock; + preempt_enable(); +} + +void perf_counter_update_userpage(struct perf_counter *counter) +{ + struct perf_mmap_data *data; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (data) + __perf_counter_update_userpage(counter, data); + rcu_read_unlock(); } static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct perf_counter *counter = vma->vm_file->private_data; + struct perf_mmap_data *data; + int ret = VM_FAULT_SIGBUS; - if (!counter->user_page) - return VM_FAULT_SIGBUS; + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (!data) + goto unlock; + + if (vmf->pgoff == 0) { + vmf->page = virt_to_page(data->user_page); + } else { + int nr = vmf->pgoff - 1; - vmf->page = virt_to_page(counter->user_page); + if ((unsigned)nr > data->nr_pages) + goto unlock; + + vmf->page = virt_to_page(data->data_pages[nr]); + } get_page(vmf->page); + ret = 0; +unlock: + rcu_read_unlock(); + + return ret; +} + +static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) +{ + struct perf_mmap_data *data; + unsigned long size; + int i; + + WARN_ON(atomic_read(&counter->mmap_count)); + + size = sizeof(struct perf_mmap_data); + size += nr_pages * sizeof(void *); + + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto fail; + + data->user_page = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->user_page) + goto fail_user_page; + + for (i = 0; i < nr_pages; i++) { + data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->data_pages[i]) + goto fail_data_pages; + } + + data->nr_pages = nr_pages; + + rcu_assign_pointer(counter->data, data); + return 0; + +fail_data_pages: + for (i--; i >= 0; i--) + free_page((unsigned long)data->data_pages[i]); + + free_page((unsigned long)data->user_page); + +fail_user_page: + kfree(data); + +fail: + return -ENOMEM; +} + +static void __perf_mmap_data_free(struct rcu_head *rcu_head) +{ + struct perf_mmap_data *data = container_of(rcu_head, + struct perf_mmap_data, rcu_head); + int i; + + free_page((unsigned long)data->user_page); + for (i = 0; i < data->nr_pages; i++) + free_page((unsigned long)data->data_pages[i]); + kfree(data); +} + +static void perf_mmap_data_free(struct perf_counter *counter) +{ + struct perf_mmap_data *data = counter->data; + + WARN_ON(atomic_read(&counter->mmap_count)); + + rcu_assign_pointer(counter->data, NULL); + call_rcu(&data->rcu_head, __perf_mmap_data_free); +} + +static void perf_mmap_open(struct vm_area_struct *vma) +{ + struct perf_counter *counter = vma->vm_file->private_data; + + atomic_inc(&counter->mmap_count); +} + +static void perf_mmap_close(struct vm_area_struct *vma) +{ + struct perf_counter *counter = vma->vm_file->private_data; + + if (atomic_dec_and_mutex_lock(&counter->mmap_count, + &counter->mmap_mutex)) { + perf_mmap_data_free(counter); + mutex_unlock(&counter->mmap_mutex); + } } static struct vm_operations_struct perf_mmap_vmops = { + .open = perf_mmap_open, + .close = perf_mmap_close, .fault = perf_mmap_fault, }; static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_counter *counter = file->private_data; - unsigned long userpg; + unsigned long vma_size; + unsigned long nr_pages; + unsigned long locked, lock_limit; + int ret = 0; if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) return -EINVAL; - if (vma->vm_end - vma->vm_start != PAGE_SIZE) + + vma_size = vma->vm_end - vma->vm_start; + nr_pages = (vma_size / PAGE_SIZE) - 1; + + if (nr_pages == 0 || !is_power_of_2(nr_pages)) return -EINVAL; - /* - * For now, restrict to the case of a hardware counter - * on the current task. - */ - if (is_software_counter(counter) || counter->task != current) + if (vma_size != PAGE_SIZE * (1 + nr_pages)) return -EINVAL; - userpg = counter->user_page; - if (!userpg) { - userpg = get_zeroed_page(GFP_KERNEL); - mutex_lock(&counter->mutex); - if (counter->user_page) { - free_page(userpg); - userpg = counter->user_page; - } else { - counter->user_page = userpg; - } - mutex_unlock(&counter->mutex); - if (!userpg) - return -ENOMEM; - } + if (vma->vm_pgoff != 0) + return -EINVAL; + + locked = vma_size >> PAGE_SHIFT; + locked += vma->vm_mm->locked_vm; - perf_counter_update_userpage(counter); + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) + return -EPERM; + + mutex_lock(&counter->mmap_mutex); + if (atomic_inc_not_zero(&counter->mmap_count)) + goto out; + + WARN_ON(counter->data); + ret = perf_mmap_data_alloc(counter, nr_pages); + if (!ret) + atomic_set(&counter->mmap_count, 1); +out: + mutex_unlock(&counter->mmap_mutex); vma->vm_flags &= ~VM_MAYWRITE; vma->vm_flags |= VM_RESERVED; vma->vm_ops = &perf_mmap_vmops; - return 0; + + return ret; } static const struct file_operations perf_fops = { @@ -1434,30 +1402,94 @@ static const struct file_operations perf_fops = { * Output */ -static void perf_counter_store_irq(struct perf_counter *counter, u64 data) +static int perf_output_write(struct perf_counter *counter, int nmi, + void *buf, ssize_t size) { - struct perf_data *irqdata = counter->irqdata; + struct perf_mmap_data *data; + unsigned int offset, head, nr; + unsigned int len; + int ret, wakeup; - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; + rcu_read_lock(); + ret = -ENOSPC; + data = rcu_dereference(counter->data); + if (!data) + goto out; + + if (!data->nr_pages) + goto out; + + ret = -EINVAL; + if (size > PAGE_SIZE) + goto out; + + do { + offset = head = atomic_read(&data->head); + head += sizeof(u64); + } while (atomic_cmpxchg(&data->head, offset, head) != offset); + + wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); - *p = data; - irqdata->len += sizeof(u64); + nr = (offset >> PAGE_SHIFT) & (data->nr_pages - 1); + offset &= PAGE_SIZE - 1; + + len = min_t(unsigned int, PAGE_SIZE - offset, size); + memcpy(data->data_pages[nr] + offset, buf, len); + size -= len; + + if (size) { + nr = (nr + 1) & (data->nr_pages - 1); + memcpy(data->data_pages[nr], buf + len, size); + } + + /* + * generate a poll() wakeup for every page boundary crossed + */ + if (wakeup) { + __perf_counter_update_userpage(counter, data); + if (nmi) { + counter->wakeup_pending = 1; + set_perf_counter_pending(); + } else + wake_up(&counter->waitq); } + ret = 0; +out: + rcu_read_unlock(); + + return ret; } -static void perf_counter_handle_group(struct perf_counter *counter) +static void perf_output_simple(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + u64 entry; + + entry = instruction_pointer(regs); + + perf_output_write(counter, nmi, &entry, sizeof(entry)); +} + +struct group_entry { + u64 event; + u64 counter; +}; + +static void perf_output_group(struct perf_counter *counter, int nmi) { struct perf_counter *leader, *sub; leader = counter->group_leader; list_for_each_entry(sub, &leader->sibling_list, list_entry) { + struct group_entry entry; + if (sub != counter) sub->hw_ops->read(sub); - perf_counter_store_irq(counter, sub->hw_event.config); - perf_counter_store_irq(counter, atomic64_read(&sub->count)); + + entry.event = sub->hw_event.config; + entry.counter = atomic64_read(&sub->count); + + perf_output_write(counter, nmi, &entry, sizeof(entry)); } } @@ -1469,19 +1501,13 @@ void perf_counter_output(struct perf_counter *counter, return; case PERF_RECORD_IRQ: - perf_counter_store_irq(counter, instruction_pointer(regs)); + perf_output_simple(counter, nmi, regs); break; case PERF_RECORD_GROUP: - perf_counter_handle_group(counter); + perf_output_group(counter, nmi); break; } - - if (nmi) { - counter->wakeup_pending = 1; - set_perf_counter_pending(); - } else - wake_up(&counter->waitq); } /* @@ -1967,10 +1993,10 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, INIT_LIST_HEAD(&counter->sibling_list); init_waitqueue_head(&counter->waitq); + mutex_init(&counter->mmap_mutex); + INIT_LIST_HEAD(&counter->child_list); - counter->irqdata = &counter->data[0]; - counter->usrdata = &counter->data[1]; counter->cpu = cpu; counter->hw_event = *hw_event; counter->wakeup_pending = 0; -- cgit v1.2.3 From c7138f37f905bb7987b1f9f5a8ee73667db39f25 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Mar 2009 13:18:16 +0100 Subject: perf_counter: fix perf_poll() Impact: fix kerneltop 100% CPU usage Only return a poll event when there's actually been one, poll_wait() doesn't actually wait for the waitq you pass it, it only enqueues you on it. Only once all FDs have been iterated and none of thm returned a poll-event will it schedule(). Also make it return POLL_HUP when there's not mmap() area to read from. Further, fix a silly bug in the write code. Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Arjan van de Ven Orig-LKML-Reference: <1237897096.24918.181.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0dfe91094fd..affe227d56a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1161,7 +1161,16 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_counter *counter = file->private_data; - unsigned int events = POLLIN; + struct perf_mmap_data *data; + unsigned int events; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (data) + events = atomic_xchg(&data->wakeup, 0); + else + events = POLL_HUP; + rcu_read_unlock(); poll_wait(file, &counter->waitq, wait); @@ -1425,7 +1434,7 @@ static int perf_output_write(struct perf_counter *counter, int nmi, do { offset = head = atomic_read(&data->head); - head += sizeof(u64); + head += size; } while (atomic_cmpxchg(&data->head, offset, head) != offset); wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); @@ -1446,6 +1455,7 @@ static int perf_output_write(struct perf_counter *counter, int nmi, * generate a poll() wakeup for every page boundary crossed */ if (wakeup) { + atomic_xchg(&data->wakeup, POLL_IN); __perf_counter_update_userpage(counter, data); if (nmi) { counter->wakeup_pending = 1; -- cgit v1.2.3 From b9cacc7bf193df16532bfa7d7ca77fe50fc3c2e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 12:30:22 +0100 Subject: perf_counter: more elaborate write API Provide a begin, copy, end interface to the output buffer. begin() reserves the space, copy() copies the data over, considering page boundaries, end() finalizes the event and does the wakeup. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Mike Galbraith Cc: Arjan van de Ven Cc: Wu Fengguang Orig-LKML-Reference: <20090325113316.740550870@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 109 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 75 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index affe227d56a..0422fd9bf62 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -16,15 +17,14 @@ #include #include #include +#include +#include +#include #include #include #include #include #include -#include -#include -#include -#include #include @@ -1411,16 +1411,20 @@ static const struct file_operations perf_fops = { * Output */ -static int perf_output_write(struct perf_counter *counter, int nmi, - void *buf, ssize_t size) +struct perf_output_handle { + struct perf_counter *counter; + struct perf_mmap_data *data; + unsigned int offset; + int wakeup; +}; + +static int perf_output_begin(struct perf_output_handle *handle, + struct perf_counter *counter, unsigned int size) { struct perf_mmap_data *data; - unsigned int offset, head, nr; - unsigned int len; - int ret, wakeup; + unsigned int offset, head; rcu_read_lock(); - ret = -ENOSPC; data = rcu_dereference(counter->data); if (!data) goto out; @@ -1428,45 +1432,82 @@ static int perf_output_write(struct perf_counter *counter, int nmi, if (!data->nr_pages) goto out; - ret = -EINVAL; - if (size > PAGE_SIZE) - goto out; - do { offset = head = atomic_read(&data->head); head += size; } while (atomic_cmpxchg(&data->head, offset, head) != offset); - wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); + handle->counter = counter; + handle->data = data; + handle->offset = offset; + handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); - nr = (offset >> PAGE_SHIFT) & (data->nr_pages - 1); - offset &= PAGE_SIZE - 1; + return 0; - len = min_t(unsigned int, PAGE_SIZE - offset, size); - memcpy(data->data_pages[nr] + offset, buf, len); - size -= len; +out: + rcu_read_unlock(); - if (size) { - nr = (nr + 1) & (data->nr_pages - 1); - memcpy(data->data_pages[nr], buf + len, size); - } + return -ENOSPC; +} - /* - * generate a poll() wakeup for every page boundary crossed - */ - if (wakeup) { - atomic_xchg(&data->wakeup, POLL_IN); - __perf_counter_update_userpage(counter, data); +static void perf_output_copy(struct perf_output_handle *handle, + void *buf, unsigned int len) +{ + unsigned int pages_mask; + unsigned int offset; + unsigned int size; + void **pages; + + offset = handle->offset; + pages_mask = handle->data->nr_pages - 1; + pages = handle->data->data_pages; + + do { + unsigned int page_offset; + int nr; + + nr = (offset >> PAGE_SHIFT) & pages_mask; + page_offset = offset & (PAGE_SIZE - 1); + size = min_t(unsigned int, PAGE_SIZE - page_offset, len); + + memcpy(pages[nr] + page_offset, buf, size); + + len -= size; + buf += size; + offset += size; + } while (len); + + handle->offset = offset; +} + +static void perf_output_end(struct perf_output_handle *handle, int nmi) +{ + if (handle->wakeup) { + (void)atomic_xchg(&handle->data->wakeup, POLL_IN); + __perf_counter_update_userpage(handle->counter, handle->data); if (nmi) { - counter->wakeup_pending = 1; + handle->counter->wakeup_pending = 1; set_perf_counter_pending(); } else - wake_up(&counter->waitq); + wake_up(&handle->counter->waitq); } - ret = 0; -out: rcu_read_unlock(); +} + +static int perf_output_write(struct perf_counter *counter, int nmi, + void *buf, ssize_t size) +{ + struct perf_output_handle handle; + int ret; + ret = perf_output_begin(&handle, counter, size); + if (ret) + goto out; + + perf_output_copy(&handle, buf, size); + perf_output_end(&handle, nmi); + +out: return ret; } -- cgit v1.2.3 From 5c1481943250ab65fa5130e05ec479c93216e9f7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 12:30:23 +0100 Subject: perf_counter: output objects Provide a {type,size} header for each output entry. This should provide extensible output, and the ability to mix multiple streams. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Mike Galbraith Cc: Arjan van de Ven Cc: Wu Fengguang Orig-LKML-Reference: <20090325113316.831607932@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 53 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0422fd9bf62..d76e3112d38 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -75,8 +75,10 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) */ if (counter->group_leader == counter) list_add_tail(&counter->list_entry, &ctx->counter_list); - else + else { list_add_tail(&counter->list_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; + } list_add_rcu(&counter->event_entry, &ctx->event_list); } @@ -89,6 +91,9 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_del_init(&counter->list_entry); list_del_rcu(&counter->event_entry); + if (counter->group_leader != counter) + counter->group_leader->nr_siblings--; + /* * If this was a group counter with sibling counters then * upgrade the siblings to singleton counters by adding them @@ -381,9 +386,11 @@ static int is_software_only_group(struct perf_counter *leader) if (!is_software_counter(leader)) return 0; + list_for_each_entry(counter, &leader->sibling_list, list_entry) if (!is_software_counter(counter)) return 0; + return 1; } @@ -1480,6 +1487,9 @@ static void perf_output_copy(struct perf_output_handle *handle, handle->offset = offset; } +#define perf_output_put(handle, x) \ + perf_output_copy((handle), &(x), sizeof(x)) + static void perf_output_end(struct perf_output_handle *handle, int nmi) { if (handle->wakeup) { @@ -1514,34 +1524,53 @@ out: static void perf_output_simple(struct perf_counter *counter, int nmi, struct pt_regs *regs) { - u64 entry; + struct { + struct perf_event_header header; + u64 ip; + } event; - entry = instruction_pointer(regs); + event.header.type = PERF_EVENT_IP; + event.header.size = sizeof(event); + event.ip = instruction_pointer(regs); - perf_output_write(counter, nmi, &entry, sizeof(entry)); + perf_output_write(counter, nmi, &event, sizeof(event)); } -struct group_entry { - u64 event; - u64 counter; -}; - static void perf_output_group(struct perf_counter *counter, int nmi) { + struct perf_output_handle handle; + struct perf_event_header header; struct perf_counter *leader, *sub; + unsigned int size; + struct { + u64 event; + u64 counter; + } entry; + int ret; + + size = sizeof(header) + counter->nr_siblings * sizeof(entry); + + ret = perf_output_begin(&handle, counter, size); + if (ret) + return; + + header.type = PERF_EVENT_GROUP; + header.size = size; + + perf_output_put(&handle, header); leader = counter->group_leader; list_for_each_entry(sub, &leader->sibling_list, list_entry) { - struct group_entry entry; - if (sub != counter) sub->hw_ops->read(sub); entry.event = sub->hw_event.config; entry.counter = atomic64_read(&sub->count); - perf_output_write(counter, nmi, &entry, sizeof(entry)); + perf_output_put(&handle, entry); } + + perf_output_end(&handle, nmi); } void perf_counter_output(struct perf_counter *counter, -- cgit v1.2.3 From 63e35b25d6b5c3136d22ef249dbbf96716aa08bf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 12:30:24 +0100 Subject: perf_counter: sanity check on the output API Ensure we never write more than we said we would. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Mike Galbraith Cc: Arjan van de Ven Cc: Wu Fengguang Orig-LKML-Reference: <20090325113316.921433024@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d76e3112d38..7669afe82cc 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1422,6 +1422,7 @@ struct perf_output_handle { struct perf_counter *counter; struct perf_mmap_data *data; unsigned int offset; + unsigned int head; int wakeup; }; @@ -1447,6 +1448,7 @@ static int perf_output_begin(struct perf_output_handle *handle, handle->counter = counter; handle->data = data; handle->offset = offset; + handle->head = head; handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); return 0; @@ -1485,6 +1487,8 @@ static void perf_output_copy(struct perf_output_handle *handle, } while (len); handle->offset = offset; + + WARN_ON_ONCE(handle->offset > handle->head); } #define perf_output_put(handle, x) \ -- cgit v1.2.3 From ea5d20cf99db5d26d43b6d322d3ace17e08a6552 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 12:30:25 +0100 Subject: perf_counter: optionally provide the pid/tid of the sampled task Allow cpu wide counters to profile userspace by providing what process the sample belongs to. This raises the first issue with the output type, lots of these options: group, tid, callchain, etc.. are non-exclusive and could be combined, suggesting a bitfield. However, things like the mmap() data stream doesn't fit in that. How to split the type field... Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Mike Galbraith Cc: Arjan van de Ven Cc: Wu Fengguang Orig-LKML-Reference: <20090325113317.013775235@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7669afe82cc..f3e1b27bc1b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1528,16 +1528,30 @@ out: static void perf_output_simple(struct perf_counter *counter, int nmi, struct pt_regs *regs) { + unsigned int size; struct { struct perf_event_header header; u64 ip; + u32 pid, tid; } event; event.header.type = PERF_EVENT_IP; - event.header.size = sizeof(event); event.ip = instruction_pointer(regs); - perf_output_write(counter, nmi, &event, sizeof(event)); + size = sizeof(event); + + if (counter->hw_event.include_tid) { + /* namespace issues */ + event.pid = current->group_leader->pid; + event.tid = current->pid; + + event.header.type |= __PERF_EVENT_TID; + } else + size -= sizeof(u64); + + event.header.size = size; + + perf_output_write(counter, nmi, &event, size); } static void perf_output_group(struct perf_counter *counter, int nmi) -- cgit v1.2.3 From 7730d8655880f41f2ea519aca2ca6a1413dfd2c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 12:48:31 +0100 Subject: perf_counter: allow and require one-page mmap on counting counters A brainfart stopped single page mmap()s working. The rest of the code should be perfectly fine with not having any data pages. Reported-by: Paul Mackerras Signed-off-by: Peter Zijlstra Orig-LKML-Reference: <1237981712.7972.812.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f3e1b27bc1b..95e02575546 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1369,7 +1369,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) vma_size = vma->vm_end - vma->vm_start; nr_pages = (vma_size / PAGE_SIZE) - 1; - if (nr_pages == 0 || !is_power_of_2(nr_pages)) + /* + * If we have data pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) return -EINVAL; if (vma_size != PAGE_SIZE * (1 + nr_pages)) -- cgit v1.2.3 From 53cfbf593758916aac41db728f029986a62f1254 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 25 Mar 2009 22:46:58 +1100 Subject: perf_counter: record time running and time enabled for each counter Impact: new functionality Currently, if there are more counters enabled than can fit on the CPU, the kernel will multiplex the counters on to the hardware using round-robin scheduling. That isn't too bad for sampling counters, but for counting counters it means that the value read from a counter represents some unknown fraction of the true count of events that occurred while the counter was enabled. This remedies the situation by keeping track of how long each counter is enabled for, and how long it is actually on the cpu and counting events. These times are recorded in nanoseconds using the task clock for per-task counters and the cpu clock for per-cpu counters. These values can be supplied to userspace on a read from the counter. Userspace requests that they be supplied after the counter value by setting the PERF_FORMAT_TOTAL_TIME_ENABLED and/or PERF_FORMAT_TOTAL_TIME_RUNNING bits in the hw_event.read_format field when creating the counter. (There is no way to change the read format after the counter is created, though it would be possible to add some way to do that.) Using this information it is possible for userspace to scale the count it reads from the counter to get an estimate of the true count: true_count_estimate = count * total_time_enabled / total_time_running This also lets userspace detect the situation where the counter never got to go on the cpu: total_time_running == 0. This functionality has been requested by the PAPI developers, and will be generally needed for interpreting the count values from counting counters correctly. In the implementation, this keeps 5 time values (in nanoseconds) for each counter: total_time_enabled and total_time_running are used when the counter is in state OFF or ERROR and for reporting back to userspace. When the counter is in state INACTIVE or ACTIVE, it is the tstamp_enabled, tstamp_running and tstamp_stopped values that are relevant, and total_time_enabled and total_time_running are determined from them. (tstamp_stopped is only used in INACTIVE state.) The reason for doing it like this is that it means that only counters being enabled or disabled at sched-in and sched-out time need to be updated. There are no new loops that iterate over all counters to update total_time_enabled or total_time_running. This also keeps separate child_total_time_running and child_total_time_enabled fields that get added in when reporting the totals to userspace. They are separate fields so that they can be atomic. We don't want to use atomics for total_time_running, total_time_enabled etc., because then we would have to use atomic sequences to update them, which are slower than regular arithmetic and memory accesses. It is possible to measure total_time_running by adding a task_clock counter to each group of counters, and total_time_enabled can be measured approximately with a top-level task_clock counter (though inaccuracies will creep in if you need to disable and enable groups since it is not possible in general to disable/enable the top-level task_clock counter simultaneously with another group). However, that adds extra overhead - I measured around 15% increase in the context switch latency reported by lat_ctx (from lmbench) when a task_clock counter was added to each of 2 groups, and around 25% increase when a task_clock counter was added to each of 4 groups. (In both cases a top-level task-clock counter was also added.) In contrast, the code added in this commit gives better information with no overhead that I could measure (in fact in some cases I measured lower times with this code, but the differences were all less than one standard deviation). [ v2: address review comments by Andrew Morton. ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Andrew Morton Orig-LKML-Reference: <18890.6578.728637.139402@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 157 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 136 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 95e02575546..3b862a7988c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -116,6 +116,7 @@ counter_sched_out(struct perf_counter *counter, return; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_stopped = ctx->time_now; counter->hw_ops->disable(counter); counter->oncpu = -1; @@ -251,6 +252,60 @@ retry: spin_unlock_irq(&ctx->lock); } +/* + * Get the current time for this context. + * If this is a task context, we use the task's task clock, + * or for a per-cpu context, we use the cpu clock. + */ +static u64 get_context_time(struct perf_counter_context *ctx, int update) +{ + struct task_struct *curr = ctx->task; + + if (!curr) + return cpu_clock(smp_processor_id()); + + return __task_delta_exec(curr, update) + curr->se.sum_exec_runtime; +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_counter_context *ctx, int update) +{ + ctx->time_now = get_context_time(ctx, update) - ctx->time_lost; +} + +/* + * Update the total_time_enabled and total_time_running fields for a counter. + */ +static void update_counter_times(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + u64 run_end; + + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { + counter->total_time_enabled = ctx->time_now - + counter->tstamp_enabled; + if (counter->state == PERF_COUNTER_STATE_INACTIVE) + run_end = counter->tstamp_stopped; + else + run_end = ctx->time_now; + counter->total_time_running = run_end - counter->tstamp_running; + } +} + +/* + * Update total_time_enabled and total_time_running for all counters in a group. + */ +static void update_group_times(struct perf_counter *leader) +{ + struct perf_counter *counter; + + update_counter_times(leader); + list_for_each_entry(counter, &leader->sibling_list, list_entry) + update_counter_times(counter); +} + /* * Cross CPU call to disable a performance counter */ @@ -276,6 +331,8 @@ static void __perf_counter_disable(void *info) * If it is in error state, leave it in error state. */ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { + update_context_time(ctx, 1); + update_counter_times(counter); if (counter == counter->group_leader) group_sched_out(counter, cpuctx, ctx); else @@ -320,8 +377,10 @@ static void perf_counter_disable(struct perf_counter *counter) * Since we have the lock this context can't be scheduled * in, so we can change the state safely. */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) + if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_counter_times(counter); counter->state = PERF_COUNTER_STATE_OFF; + } spin_unlock_irq(&ctx->lock); } @@ -366,6 +425,8 @@ counter_sched_in(struct perf_counter *counter, return -EAGAIN; } + counter->tstamp_running += ctx->time_now - counter->tstamp_stopped; + if (!is_software_counter(counter)) cpuctx->active_oncpu++; ctx->nr_active++; @@ -425,6 +486,17 @@ static int group_can_go_on(struct perf_counter *counter, return can_add_hw; } +static void add_counter_to_ctx(struct perf_counter *counter, + struct perf_counter_context *ctx) +{ + list_add_counter(counter, ctx); + ctx->nr_counters++; + counter->prev_state = PERF_COUNTER_STATE_OFF; + counter->tstamp_enabled = ctx->time_now; + counter->tstamp_running = ctx->time_now; + counter->tstamp_stopped = ctx->time_now; +} + /* * Cross CPU call to install and enable a performance counter */ @@ -449,6 +521,7 @@ static void __perf_install_in_context(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); + update_context_time(ctx, 1); /* * Protect the list operation against NMI by disabling the @@ -456,9 +529,7 @@ static void __perf_install_in_context(void *info) */ perf_flags = hw_perf_save_disable(); - list_add_counter(counter, ctx); - ctx->nr_counters++; - counter->prev_state = PERF_COUNTER_STATE_OFF; + add_counter_to_ctx(counter, ctx); /* * Don't put the counter on if it is disabled or if @@ -486,8 +557,10 @@ static void __perf_install_in_context(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) + if (leader->hw_event.pinned) { + update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; + } } if (!err && !ctx->task && cpuctx->max_pertask) @@ -548,10 +621,8 @@ retry: * can add the counter safely, if it the call above did not * succeed. */ - if (list_empty(&counter->list_entry)) { - list_add_counter(counter, ctx); - ctx->nr_counters++; - } + if (list_empty(&counter->list_entry)) + add_counter_to_ctx(counter, ctx); spin_unlock_irq(&ctx->lock); } @@ -576,11 +647,13 @@ static void __perf_counter_enable(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); + update_context_time(ctx, 1); counter->prev_state = counter->state; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = ctx->time_now - counter->total_time_enabled; /* * If the counter is in a group and isn't the group leader, @@ -602,8 +675,10 @@ static void __perf_counter_enable(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) + if (leader->hw_event.pinned) { + update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; + } } unlock: @@ -659,8 +734,11 @@ static void perf_counter_enable(struct perf_counter *counter) * Since we have the lock this context can't be scheduled * in, so we can change the state safely. */ - if (counter->state == PERF_COUNTER_STATE_OFF) + if (counter->state == PERF_COUNTER_STATE_OFF) { counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = ctx->time_now - + counter->total_time_enabled; + } out: spin_unlock_irq(&ctx->lock); } @@ -693,6 +771,7 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, ctx->is_active = 0; if (likely(!ctx->nr_counters)) goto out; + update_context_time(ctx, 0); flags = hw_perf_save_disable(); if (ctx->nr_active) { @@ -797,6 +876,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, if (likely(!ctx->nr_counters)) goto out; + /* + * Add any time since the last sched_out to the lost time + * so it doesn't get included in the total_time_enabled and + * total_time_running measures for counters in the context. + */ + ctx->time_lost = get_context_time(ctx, 0) - ctx->time_now; + flags = hw_perf_save_disable(); /* @@ -817,8 +903,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, * If this pinned group hasn't been scheduled, * put it in error state. */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) + if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_group_times(counter); counter->state = PERF_COUNTER_STATE_ERROR; + } } list_for_each_entry(counter, &ctx->counter_list, list_entry) { @@ -902,8 +990,10 @@ int perf_counter_task_disable(void) perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ERROR) + if (counter->state != PERF_COUNTER_STATE_ERROR) { + update_group_times(counter); counter->state = PERF_COUNTER_STATE_OFF; + } } hw_perf_restore(perf_flags); @@ -946,6 +1036,8 @@ int perf_counter_task_enable(void) if (counter->state > PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = ctx->time_now - + counter->total_time_enabled; counter->hw_event.disabled = 0; } hw_perf_restore(perf_flags); @@ -1009,10 +1101,14 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) static void __read(void *info) { struct perf_counter *counter = info; + struct perf_counter_context *ctx = counter->ctx; unsigned long flags; curr_rq_lock_irq_save(&flags); + if (ctx->is_active) + update_context_time(ctx, 1); counter->hw_ops->read(counter); + update_counter_times(counter); curr_rq_unlock_irq_restore(&flags); } @@ -1025,6 +1121,8 @@ static u64 perf_counter_read(struct perf_counter *counter) if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, __read, counter, 1); + } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_counter_times(counter); } return atomic64_read(&counter->count); @@ -1137,10 +1235,8 @@ static int perf_release(struct inode *inode, struct file *file) static ssize_t perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) { - u64 cntval; - - if (count < sizeof(cntval)) - return -EINVAL; + u64 values[3]; + int n; /* * Return end-of-file for a read on a counter that is in @@ -1151,10 +1247,24 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) return 0; mutex_lock(&counter->mutex); - cntval = perf_counter_read(counter); + values[0] = perf_counter_read(counter); + n = 1; + if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); mutex_unlock(&counter->mutex); - return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval); + if (count < n * sizeof(u64)) + return -EINVAL; + count = n * sizeof(u64); + + if (copy_to_user(buf, values, count)) + return -EFAULT; + + return count; } static ssize_t @@ -2290,8 +2400,7 @@ inherit_counter(struct perf_counter *parent_counter, * Link it up in the child's context: */ child_counter->task = child; - list_add_counter(child_counter, child_ctx); - child_ctx->nr_counters++; + add_counter_to_ctx(child_counter, child_ctx); child_counter->parent = parent_counter; /* @@ -2361,6 +2470,10 @@ static void sync_child_counter(struct perf_counter *child_counter, * Add back the child's count to the parent's count: */ atomic64_add(child_val, &parent_counter->count); + atomic64_add(child_counter->total_time_enabled, + &parent_counter->child_total_time_enabled); + atomic64_add(child_counter->total_time_running, + &parent_counter->child_total_time_running); /* * Remove this counter from the parent's list @@ -2395,6 +2508,7 @@ __perf_counter_exit_task(struct task_struct *child, if (child != current) { wait_task_inactive(child, 0); list_del_init(&child_counter->list_entry); + update_counter_times(child_counter); } else { struct perf_cpu_context *cpuctx; unsigned long flags; @@ -2412,6 +2526,7 @@ __perf_counter_exit_task(struct task_struct *child, cpuctx = &__get_cpu_var(perf_cpu_context); group_sched_out(child_counter, cpuctx, child_ctx); + update_counter_times(child_counter); list_del_init(&child_counter->list_entry); -- cgit v1.2.3 From 925d519ab82b6dd7aca9420d809ee83819c08db2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:02 +0200 Subject: perf_counter: unify and fix delayed counter wakeup While going over the wakeup code I noticed delayed wakeups only work for hardware counters but basically all software counters rely on them. This patch unifies and generalizes the delayed wakeup to fix this issue. Since we're dealing with NMI context bits here, use a cmpxchg() based single link list implementation to track counters that have pending wakeups. [ This should really be generic code for delayed wakeups, but since we cannot use cmpxchg()/xchg() in generic code, I've let it live in the perf_counter code. -- Eric Dumazet could use it to aggregate the network wakeups. ] Furthermore, the x86 method of using TIF flags was flawed in that its quite possible to end up setting the bit on the idle task, loosing the wakeup. The powerpc method uses per-cpu storage and does appear to be sufficient. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.153932974@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 128 ++++++++++++++++++++++++++++++++++++++++++++++---- kernel/timer.c | 3 ++ 2 files changed, 123 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3b862a7988c..f70ff80e79d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1197,8 +1197,12 @@ static void free_counter_rcu(struct rcu_head *head) kfree(counter); } +static void perf_pending_sync(struct perf_counter *counter); + static void free_counter(struct perf_counter *counter) { + perf_pending_sync(counter); + if (counter->destroy) counter->destroy(counter); @@ -1528,6 +1532,118 @@ static const struct file_operations perf_fops = { .mmap = perf_mmap, }; +/* + * Perf counter wakeup + * + * If there's data, ensure we set the poll() state and publish everything + * to user-space before waking everybody up. + */ + +void perf_counter_wakeup(struct perf_counter *counter) +{ + struct perf_mmap_data *data; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (data) { + (void)atomic_xchg(&data->wakeup, POLL_IN); + __perf_counter_update_userpage(counter, data); + } + rcu_read_unlock(); + + wake_up_all(&counter->waitq); +} + +/* + * Pending wakeups + * + * Handle the case where we need to wakeup up from NMI (or rq->lock) context. + * + * The NMI bit means we cannot possibly take locks. Therefore, maintain a + * single linked list and use cmpxchg() to add entries lockless. + */ + +#define PENDING_TAIL ((struct perf_wakeup_entry *)-1UL) + +static DEFINE_PER_CPU(struct perf_wakeup_entry *, perf_wakeup_head) = { + PENDING_TAIL, +}; + +static void perf_pending_queue(struct perf_counter *counter) +{ + struct perf_wakeup_entry **head; + struct perf_wakeup_entry *prev, *next; + + if (cmpxchg(&counter->wakeup.next, NULL, PENDING_TAIL) != NULL) + return; + + head = &get_cpu_var(perf_wakeup_head); + + do { + prev = counter->wakeup.next = *head; + next = &counter->wakeup; + } while (cmpxchg(head, prev, next) != prev); + + set_perf_counter_pending(); + + put_cpu_var(perf_wakeup_head); +} + +static int __perf_pending_run(void) +{ + struct perf_wakeup_entry *list; + int nr = 0; + + list = xchg(&__get_cpu_var(perf_wakeup_head), PENDING_TAIL); + while (list != PENDING_TAIL) { + struct perf_counter *counter = container_of(list, + struct perf_counter, wakeup); + + list = list->next; + + counter->wakeup.next = NULL; + /* + * Ensure we observe the unqueue before we issue the wakeup, + * so that we won't be waiting forever. + * -- see perf_not_pending(). + */ + smp_wmb(); + + perf_counter_wakeup(counter); + nr++; + } + + return nr; +} + +static inline int perf_not_pending(struct perf_counter *counter) +{ + /* + * If we flush on whatever cpu we run, there is a chance we don't + * need to wait. + */ + get_cpu(); + __perf_pending_run(); + put_cpu(); + + /* + * Ensure we see the proper queue state before going to sleep + * so that we do not miss the wakeup. -- see perf_pending_handle() + */ + smp_rmb(); + return counter->wakeup.next == NULL; +} + +static void perf_pending_sync(struct perf_counter *counter) +{ + wait_event(counter->waitq, perf_not_pending(counter)); +} + +void perf_counter_do_pending(void) +{ + __perf_pending_run(); +} + /* * Output */ @@ -1611,13 +1727,10 @@ static void perf_output_copy(struct perf_output_handle *handle, static void perf_output_end(struct perf_output_handle *handle, int nmi) { if (handle->wakeup) { - (void)atomic_xchg(&handle->data->wakeup, POLL_IN); - __perf_counter_update_userpage(handle->counter, handle->data); - if (nmi) { - handle->counter->wakeup_pending = 1; - set_perf_counter_pending(); - } else - wake_up(&handle->counter->waitq); + if (nmi) + perf_pending_queue(handle->counter); + else + perf_counter_wakeup(handle->counter); } rcu_read_unlock(); } @@ -2211,7 +2324,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->cpu = cpu; counter->hw_event = *hw_event; - counter->wakeup_pending = 0; counter->group_leader = group_leader; counter->hw_ops = NULL; counter->ctx = ctx; diff --git a/kernel/timer.c b/kernel/timer.c index b4555568b4e..672ca25fbc4 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -1167,6 +1168,8 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __get_cpu_var(tvec_bases); + perf_counter_do_pending(); + hrtimer_run_pending(); if (time_after_eq(jiffies, base->timer_jiffies)) -- cgit v1.2.3 From 38ff667b321b00f5e6830e93fb4ab11a653a2920 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:03 +0200 Subject: perf_counter: fix update_userpage() It just occured to me it is possible to have multiple contending updates of the userpage (mmap information vs overflow vs counter). This would break the seqlock logic. It appear the arch code uses this from NMI context, so we cannot possibly serialize its use, therefore separate the data_head update from it and let it return to its original use. The arch code needs to make sure there are no contending callers by disabling the counter before using it -- powerpc appears to do this nicely. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.241410660@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f70ff80e79d..c95e92329b9 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1316,10 +1316,22 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } -static void __perf_counter_update_userpage(struct perf_counter *counter, - struct perf_mmap_data *data) +/* + * Callers need to ensure there can be no nesting of this function, otherwise + * the seqlock logic goes bad. We can not serialize this because the arch + * code calls this from NMI context. + */ +void perf_counter_update_userpage(struct perf_counter *counter) { - struct perf_counter_mmap_page *userpg = data->user_page; + struct perf_mmap_data *data; + struct perf_counter_mmap_page *userpg; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (!data) + goto unlock; + + userpg = data->user_page; /* * Disable preemption so as to not let the corresponding user-space @@ -1333,20 +1345,10 @@ static void __perf_counter_update_userpage(struct perf_counter *counter, if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); - userpg->data_head = atomic_read(&data->head); smp_wmb(); ++userpg->lock; preempt_enable(); -} - -void perf_counter_update_userpage(struct perf_counter *counter) -{ - struct perf_mmap_data *data; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (data) - __perf_counter_update_userpage(counter, data); +unlock: rcu_read_unlock(); } @@ -1547,7 +1549,13 @@ void perf_counter_wakeup(struct perf_counter *counter) data = rcu_dereference(counter->data); if (data) { (void)atomic_xchg(&data->wakeup, POLL_IN); - __perf_counter_update_userpage(counter, data); + /* + * Ensure all data writes are issued before updating the + * user-space data head information. The matching rmb() + * will be in userspace after reading this value. + */ + smp_wmb(); + data->user_page->data_head = atomic_read(&data->head); } rcu_read_unlock(); -- cgit v1.2.3 From 0a4a93919bdc5cee48fe4367591e8e0449c1086c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:05 +0200 Subject: perf_counter: executable mmap() information Currently the profiling information returns userspace IPs but no way to correlate them to userspace code. Userspace could look into /proc/$pid/maps but that might not be current or even present anymore at the time of analyzing the IPs. Therefore provide means to track the mmap information and provide it in the output stream. XXX: only covers mmap()/munmap(), mremap() and mprotect() are missing. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Cc: Andrew Morton Orig-LKML-Reference: <20090330171023.417259499@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c95e92329b9..f35e89e3d6a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -1843,6 +1844,150 @@ void perf_counter_output(struct perf_counter *counter, } } +/* + * mmap tracking + */ + +struct perf_mmap_event { + struct file *file; + char *file_name; + int file_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 start; + u64 len; + u64 pgoff; + } event; +}; + +static void perf_counter_mmap_output(struct perf_counter *counter, + struct perf_mmap_event *mmap_event) +{ + struct perf_output_handle handle; + int size = mmap_event->event.header.size; + int ret = perf_output_begin(&handle, counter, size); + + if (ret) + return; + + perf_output_put(&handle, mmap_event->event); + perf_output_copy(&handle, mmap_event->file_name, + mmap_event->file_size); + perf_output_end(&handle, 0); +} + +static int perf_counter_mmap_match(struct perf_counter *counter, + struct perf_mmap_event *mmap_event) +{ + if (counter->hw_event.mmap && + mmap_event->event.header.type == PERF_EVENT_MMAP) + return 1; + + if (counter->hw_event.munmap && + mmap_event->event.header.type == PERF_EVENT_MUNMAP) + return 1; + + return 0; +} + +static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, + struct perf_mmap_event *mmap_event) +{ + struct perf_counter *counter; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_mmap_match(counter, mmap_event)) + perf_counter_mmap_output(counter, mmap_event); + } + rcu_read_unlock(); +} + +static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) +{ + struct perf_cpu_context *cpuctx; + struct file *file = mmap_event->file; + unsigned int size; + char tmp[16]; + char *buf = NULL; + char *name; + + if (file) { + buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buf) { + name = strncpy(tmp, "//enomem", sizeof(tmp)); + goto got_name; + } + name = dentry_path(file->f_dentry, buf, PATH_MAX); + if (IS_ERR(name)) { + name = strncpy(tmp, "//toolong", sizeof(tmp)); + goto got_name; + } + } else { + name = strncpy(tmp, "//anon", sizeof(tmp)); + goto got_name; + } + +got_name: + size = ALIGN(strlen(name), sizeof(u64)); + + mmap_event->file_name = name; + mmap_event->file_size = size; + + mmap_event->event.header.size = sizeof(mmap_event->event) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); + put_cpu_var(perf_cpu_context); + + perf_counter_mmap_ctx(¤t->perf_counter_ctx, mmap_event); + + kfree(buf); +} + +void perf_counter_mmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) +{ + struct perf_mmap_event mmap_event = { + .file = file, + .event = { + .header = { .type = PERF_EVENT_MMAP, }, + .pid = current->group_leader->pid, + .tid = current->pid, + .start = addr, + .len = len, + .pgoff = pgoff, + }, + }; + + perf_counter_mmap_event(&mmap_event); +} + +void perf_counter_munmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) +{ + struct perf_mmap_event mmap_event = { + .file = file, + .event = { + .header = { .type = PERF_EVENT_MUNMAP, }, + .pid = current->group_leader->pid, + .tid = current->pid, + .start = addr, + .len = len, + .pgoff = pgoff, + }, + }; + + perf_counter_mmap_event(&mmap_event); +} + /* * Generic software counter infrastructure */ -- cgit v1.2.3 From d5d2bc0dd0379deddb9ede66fec90a3083eaec57 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Mar 2009 19:07:08 +0200 Subject: perf_counter: make it possible for hw_perf_counter_init to return error codes Impact: better error reporting At present, if hw_perf_counter_init encounters an error, all it can do is return NULL, which causes sys_perf_counter_open to return an EINVAL error to userspace. This isn't very informative for userspace; it means that userspace can't tell the difference between "sorry, oprofile is already using the PMU" and "we don't support this CPU" and "this CPU doesn't support the requested generic hardware event". This commit uses the PTR_ERR/ERR_PTR/IS_ERR set of macros to let hw_perf_counter_init return an error code on error rather than just NULL if it wishes. If it does so, that error code will be returned from sys_perf_counter_open to userspace. If it returns NULL, an EINVAL error will be returned to userspace, as before. This also adapts the powerpc hw_perf_counter_init to make use of this to return ENXIO, EINVAL, EBUSY, or EOPNOTSUPP as appropriate. It would be good to add extra error numbers in future to allow userspace to distinguish the various errors that are currently reported as EINVAL, i.e. irq_period < 0, too many events in a group, conflict between exclude_* settings in a group, and PMU resource conflict in a group. [ v2: fix a bug pointed out by Corey Ashford where error returns from hw_perf_counter_init were not handled correctly in the case of raw hardware events.] Signed-off-by: Paul Mackerras Signed-off-by: Peter Zijlstra Orig-LKML-Reference: <20090330171023.682428180@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f35e89e3d6a..d07b45278b4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2453,10 +2453,11 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, { const struct hw_perf_counter_ops *hw_ops; struct perf_counter *counter; + long err; counter = kzalloc(sizeof(*counter), gfpflags); if (!counter) - return NULL; + return ERR_PTR(-ENOMEM); /* * Single counters are their own group leaders, with an @@ -2505,12 +2506,18 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, hw_ops = tp_perf_counter_init(counter); break; } +done: + err = 0; + if (!hw_ops) + err = -EINVAL; + else if (IS_ERR(hw_ops)) + err = PTR_ERR(hw_ops); - if (!hw_ops) { + if (err) { kfree(counter); - return NULL; + return ERR_PTR(err); } -done: + counter->hw_ops = hw_ops; return counter; @@ -2583,10 +2590,10 @@ SYSCALL_DEFINE5(perf_counter_open, goto err_put_context; } - ret = -EINVAL; counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader, GFP_KERNEL); - if (!counter) + ret = PTR_ERR(counter); + if (IS_ERR(counter)) goto err_put_context; ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); @@ -2658,8 +2665,8 @@ inherit_counter(struct perf_counter *parent_counter, child_counter = perf_counter_alloc(&parent_counter->hw_event, parent_counter->cpu, child_ctx, group_leader, GFP_KERNEL); - if (!child_counter) - return NULL; + if (IS_ERR(child_counter)) + return child_counter; /* * Link it up in the child's context: @@ -2710,15 +2717,17 @@ static int inherit_group(struct perf_counter *parent_counter, { struct perf_counter *leader; struct perf_counter *sub; + struct perf_counter *child_ctr; leader = inherit_counter(parent_counter, parent, parent_ctx, child, NULL, child_ctx); - if (!leader) - return -ENOMEM; + if (IS_ERR(leader)) + return PTR_ERR(leader); list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) { - if (!inherit_counter(sub, parent, parent_ctx, - child, leader, child_ctx)) - return -ENOMEM; + child_ctr = inherit_counter(sub, parent, parent_ctx, + child, leader, child_ctx); + if (IS_ERR(child_ctr)) + return PTR_ERR(child_ctr); } return 0; } -- cgit v1.2.3 From 78d613eb129fc4edf0e2cabfcc6a4c5285482d21 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:11 +0200 Subject: perf_counter: small cleanup of the output routines Move the nmi argument to the _begin() function, so that _end() only needs the handle. This allows the _begin() function to generate a wakeup on event loss. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.959404268@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d07b45278b4..4471e7e2c10 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1663,10 +1663,20 @@ struct perf_output_handle { unsigned int offset; unsigned int head; int wakeup; + int nmi; }; +static inline void __perf_output_wakeup(struct perf_output_handle *handle) +{ + if (handle->nmi) + perf_pending_queue(handle->counter); + else + perf_counter_wakeup(handle->counter); +} + static int perf_output_begin(struct perf_output_handle *handle, - struct perf_counter *counter, unsigned int size) + struct perf_counter *counter, unsigned int size, + int nmi) { struct perf_mmap_data *data; unsigned int offset, head; @@ -1676,15 +1686,17 @@ static int perf_output_begin(struct perf_output_handle *handle, if (!data) goto out; + handle->counter = counter; + handle->nmi = nmi; + if (!data->nr_pages) - goto out; + goto fail; do { offset = head = atomic_read(&data->head); head += size; } while (atomic_cmpxchg(&data->head, offset, head) != offset); - handle->counter = counter; handle->data = data; handle->offset = offset; handle->head = head; @@ -1692,6 +1704,8 @@ static int perf_output_begin(struct perf_output_handle *handle, return 0; +fail: + __perf_output_wakeup(handle); out: rcu_read_unlock(); @@ -1733,14 +1747,10 @@ static void perf_output_copy(struct perf_output_handle *handle, #define perf_output_put(handle, x) \ perf_output_copy((handle), &(x), sizeof(x)) -static void perf_output_end(struct perf_output_handle *handle, int nmi) +static void perf_output_end(struct perf_output_handle *handle) { - if (handle->wakeup) { - if (nmi) - perf_pending_queue(handle->counter); - else - perf_counter_wakeup(handle->counter); - } + if (handle->wakeup) + __perf_output_wakeup(handle); rcu_read_unlock(); } @@ -1750,12 +1760,12 @@ static int perf_output_write(struct perf_counter *counter, int nmi, struct perf_output_handle handle; int ret; - ret = perf_output_begin(&handle, counter, size); + ret = perf_output_begin(&handle, counter, size, nmi); if (ret) goto out; perf_output_copy(&handle, buf, size); - perf_output_end(&handle, nmi); + perf_output_end(&handle); out: return ret; @@ -1804,7 +1814,7 @@ static void perf_output_group(struct perf_counter *counter, int nmi) size = sizeof(header) + counter->nr_siblings * sizeof(entry); - ret = perf_output_begin(&handle, counter, size); + ret = perf_output_begin(&handle, counter, size, nmi); if (ret) return; @@ -1824,7 +1834,7 @@ static void perf_output_group(struct perf_counter *counter, int nmi) perf_output_put(&handle, entry); } - perf_output_end(&handle, nmi); + perf_output_end(&handle); } void perf_counter_output(struct perf_counter *counter, @@ -1869,7 +1879,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter, { struct perf_output_handle handle; int size = mmap_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size); + int ret = perf_output_begin(&handle, counter, size, 0); if (ret) return; @@ -1877,7 +1887,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter, perf_output_put(&handle, mmap_event->event); perf_output_copy(&handle, mmap_event->file_name, mmap_event->file_size); - perf_output_end(&handle, 0); + perf_output_end(&handle); } static int perf_counter_mmap_match(struct perf_counter *counter, -- cgit v1.2.3 From 5ed00415e304203a0a9dcaef226d6d3f1106070e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:12 +0200 Subject: perf_counter: re-arrange the perf_event_type Breaks ABI yet again :-) Change the event type so that [0, 2^31-1] are regular event types, but [2^31, 2^32-1] forms a bitmask for overflow events. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171024.047961770@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 56 +++++++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4471e7e2c10..d93e9ddf784 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1754,50 +1754,44 @@ static void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static int perf_output_write(struct perf_counter *counter, int nmi, - void *buf, ssize_t size) -{ - struct perf_output_handle handle; - int ret; - - ret = perf_output_begin(&handle, counter, size, nmi); - if (ret) - goto out; - - perf_output_copy(&handle, buf, size); - perf_output_end(&handle); - -out: - return ret; -} - static void perf_output_simple(struct perf_counter *counter, int nmi, struct pt_regs *regs) { - unsigned int size; + int ret; + struct perf_output_handle handle; + struct perf_event_header header; + u64 ip; struct { - struct perf_event_header header; - u64 ip; u32 pid, tid; - } event; + } tid_entry; - event.header.type = PERF_EVENT_IP; - event.ip = instruction_pointer(regs); + header.type = PERF_EVENT_OVERFLOW; + header.size = sizeof(header); - size = sizeof(event); + ip = instruction_pointer(regs); + header.type |= __PERF_EVENT_IP; + header.size += sizeof(ip); if (counter->hw_event.include_tid) { /* namespace issues */ - event.pid = current->group_leader->pid; - event.tid = current->pid; + tid_entry.pid = current->group_leader->pid; + tid_entry.tid = current->pid; - event.header.type |= __PERF_EVENT_TID; - } else - size -= sizeof(u64); + header.type |= __PERF_EVENT_TID; + header.size += sizeof(tid_entry); + } - event.header.size = size; + ret = perf_output_begin(&handle, counter, header.size, nmi); + if (ret) + return; + + perf_output_put(&handle, header); + perf_output_put(&handle, ip); + + if (counter->hw_event.include_tid) + perf_output_put(&handle, tid_entry); - perf_output_write(counter, nmi, &event, size); + perf_output_end(&handle); } static void perf_output_group(struct perf_counter *counter, int nmi) -- cgit v1.2.3 From 394ee07623cf556c8daae2b3c00cf5fea47f0811 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:14 +0200 Subject: perf_counter: provide generic callchain bits Provide the generic callchain support bits. If hw_event->callchain is set the arch specific perf_callchain() function is called upon to provide a perf_callchain_entry structure filled with the current callchain. If it does so, it is added to the overflow output event. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171024.254266860@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d93e9ddf784..860cdc26bd7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1653,6 +1653,17 @@ void perf_counter_do_pending(void) __perf_pending_run(); } +/* + * Callchain support -- arch specific + */ + +struct perf_callchain_entry * +__attribute__((weak)) +perf_callchain(struct pt_regs *regs) +{ + return NULL; +} + /* * Output */ @@ -1764,6 +1775,8 @@ static void perf_output_simple(struct perf_counter *counter, struct { u32 pid, tid; } tid_entry; + struct perf_callchain_entry *callchain = NULL; + int callchain_size = 0; header.type = PERF_EVENT_OVERFLOW; header.size = sizeof(header); @@ -1781,6 +1794,17 @@ static void perf_output_simple(struct perf_counter *counter, header.size += sizeof(tid_entry); } + if (counter->hw_event.callchain) { + callchain = perf_callchain(regs); + + if (callchain) { + callchain_size = (1 + callchain->nr) * sizeof(u64); + + header.type |= __PERF_EVENT_CALLCHAIN; + header.size += callchain_size; + } + } + ret = perf_output_begin(&handle, counter, header.size, nmi); if (ret) return; @@ -1791,6 +1815,9 @@ static void perf_output_simple(struct perf_counter *counter, if (counter->hw_event.include_tid) perf_output_put(&handle, tid_entry); + if (callchain) + perf_output_copy(&handle, callchain, callchain_size); + perf_output_end(&handle); } -- cgit v1.2.3 From 8a057d84912f36e53f970c4d177cb4bb6b2f9e08 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:11:59 +0200 Subject: perf_counter: move the event overflow output bits to record_type Per suggestion from Paul, move the event overflow bits to record_type and sanitize the enums a bit. Breaks the ABI -- again ;-) Suggested-by: Paul Mackerras Signed-off-by: Peter Zijlstra Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.151921176@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 101 ++++++++++++++++++++------------------------------ 1 file changed, 40 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 860cdc26bd7..995063df910 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1765,27 +1765,34 @@ static void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static void perf_output_simple(struct perf_counter *counter, - int nmi, struct pt_regs *regs) +void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs) { int ret; + u64 record_type = counter->hw_event.record_type; struct perf_output_handle handle; struct perf_event_header header; u64 ip; struct { u32 pid, tid; } tid_entry; + struct { + u64 event; + u64 counter; + } group_entry; struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; - header.type = PERF_EVENT_OVERFLOW; + header.type = PERF_EVENT_COUNTER_OVERFLOW; header.size = sizeof(header); - ip = instruction_pointer(regs); - header.type |= __PERF_EVENT_IP; - header.size += sizeof(ip); + if (record_type & PERF_RECORD_IP) { + ip = instruction_pointer(regs); + header.type |= __PERF_EVENT_IP; + header.size += sizeof(ip); + } - if (counter->hw_event.include_tid) { + if (record_type & PERF_RECORD_TID) { /* namespace issues */ tid_entry.pid = current->group_leader->pid; tid_entry.tid = current->pid; @@ -1794,7 +1801,13 @@ static void perf_output_simple(struct perf_counter *counter, header.size += sizeof(tid_entry); } - if (counter->hw_event.callchain) { + if (record_type & PERF_RECORD_GROUP) { + header.type |= __PERF_EVENT_GROUP; + header.size += sizeof(u64) + + counter->nr_siblings * sizeof(group_entry); + } + + if (record_type & PERF_RECORD_CALLCHAIN) { callchain = perf_callchain(regs); if (callchain) { @@ -1810,69 +1823,35 @@ static void perf_output_simple(struct perf_counter *counter, return; perf_output_put(&handle, header); - perf_output_put(&handle, ip); - if (counter->hw_event.include_tid) - perf_output_put(&handle, tid_entry); + if (record_type & PERF_RECORD_IP) + perf_output_put(&handle, ip); - if (callchain) - perf_output_copy(&handle, callchain, callchain_size); - - perf_output_end(&handle); -} - -static void perf_output_group(struct perf_counter *counter, int nmi) -{ - struct perf_output_handle handle; - struct perf_event_header header; - struct perf_counter *leader, *sub; - unsigned int size; - struct { - u64 event; - u64 counter; - } entry; - int ret; - - size = sizeof(header) + counter->nr_siblings * sizeof(entry); + if (record_type & PERF_RECORD_TID) + perf_output_put(&handle, tid_entry); - ret = perf_output_begin(&handle, counter, size, nmi); - if (ret) - return; + if (record_type & PERF_RECORD_GROUP) { + struct perf_counter *leader, *sub; + u64 nr = counter->nr_siblings; - header.type = PERF_EVENT_GROUP; - header.size = size; + perf_output_put(&handle, nr); - perf_output_put(&handle, header); + leader = counter->group_leader; + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + if (sub != counter) + sub->hw_ops->read(sub); - leader = counter->group_leader; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - if (sub != counter) - sub->hw_ops->read(sub); + group_entry.event = sub->hw_event.config; + group_entry.counter = atomic64_read(&sub->count); - entry.event = sub->hw_event.config; - entry.counter = atomic64_read(&sub->count); - - perf_output_put(&handle, entry); + perf_output_put(&handle, group_entry); + } } - perf_output_end(&handle); -} - -void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs) -{ - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - return; - - case PERF_RECORD_IRQ: - perf_output_simple(counter, nmi, regs); - break; + if (callchain) + perf_output_copy(&handle, callchain, callchain_size); - case PERF_RECORD_GROUP: - perf_output_group(counter, nmi); - break; - } + perf_output_end(&handle); } /* -- cgit v1.2.3 From c457810ab4a825161aec6ef71b581e1bc8febd1a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:01 +0200 Subject: perf_counter: per event wakeups By request, provide a way to request a wakeup every 'n' events instead of every page of output. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.323309784@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 995063df910..9bcab10e735 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1760,7 +1760,15 @@ static void perf_output_copy(struct perf_output_handle *handle, static void perf_output_end(struct perf_output_handle *handle) { - if (handle->wakeup) + int wakeup_events = handle->counter->hw_event.wakeup_events; + + if (wakeup_events) { + int events = atomic_inc_return(&handle->data->events); + if (events >= wakeup_events) { + atomic_sub(wakeup_events, &handle->data->events); + __perf_output_wakeup(handle); + } + } else if (handle->wakeup) __perf_output_wakeup(handle); rcu_read_unlock(); } -- cgit v1.2.3 From 5872bdb88a35fae7d224bd6b21e5f377e854ccfc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:03 +0200 Subject: perf_counter: add more context information Put in counts to tell which ips belong to what context. ----- | | hv | -- nr | | kernel | -- | | user ----- Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.493101305@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 9bcab10e735..f105a6e696c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1819,7 +1819,7 @@ void perf_counter_output(struct perf_counter *counter, callchain = perf_callchain(regs); if (callchain) { - callchain_size = (1 + callchain->nr) * sizeof(u64); + callchain_size = (2 + callchain->nr) * sizeof(u64); header.type |= __PERF_EVENT_CALLCHAIN; header.size += callchain_size; -- cgit v1.2.3 From 92f22a3865abe87eea2609a6f8e5be5123f7ce4f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:04 +0200 Subject: perf_counter: update mmap() counter read Paul noted that we don't need SMP barriers for the mmap() counter read because its always on the same cpu (otherwise you can't access the hw counter anyway). So remove the SMP barriers and replace them with regular compiler barriers. Further, update the comment to include a race free method of reading said hardware counter. The primary change is putting the pmc_read inside the seq-loop, otherwise we can still race and read rubbish. Noticed-by: Paul Mackerras Signed-off-by: Peter Zijlstra Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.577951445@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f105a6e696c..2a5d4f52556 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1340,13 +1340,13 @@ void perf_counter_update_userpage(struct perf_counter *counter) */ preempt_disable(); ++userpg->lock; - smp_wmb(); + barrier(); userpg->index = counter->hw.idx; userpg->offset = atomic64_read(&counter->count); if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); - smp_wmb(); + barrier(); ++userpg->lock; preempt_enable(); unlock: -- cgit v1.2.3 From ca5f9524d61f54b1f618293ab92fc6b49cac864d Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:39:33 -0700 Subject: futex: separate futex_wait_queue_me() logic from futex_wait() Refactor futex_wait() in preparation for futex_wait_requeue_pi(). In order to reuse a good chunk of the futex_wait() code for the upcoming futex_wait_requeue_pi() function, this patch breaks out the queue-to-wakeup section of futex_wait() into futex_wait_queue_me(). Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 138 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 76 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6b50a024bca..ebb48d6d1a8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1115,24 +1115,87 @@ handle_fault: static long futex_wait_restart(struct restart_block *restart); +/** + * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal + * @hb: the futex hash bucket, must be locked by the caller + * @q: the futex_q to queue up on + * @timeout: the prepared hrtimer_sleeper, or null for no timeout + * @wait: the wait_queue to add to the futex_q after queueing in the hb + */ +static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, + struct hrtimer_sleeper *timeout, + wait_queue_t *wait) +{ + queue_me(q, hb); + + /* + * There might have been scheduling since the queue_me(), as we + * cannot hold a spinlock across the get_user() in case it + * faults, and we cannot just set TASK_INTERRUPTIBLE state when + * queueing ourselves into the futex hash. This code thus has to + * rely on the futex_wake() code removing us from hash when it + * wakes us up. + */ + + /* add_wait_queue is the barrier after __set_current_state. */ + __set_current_state(TASK_INTERRUPTIBLE); + + /* + * Add current as the futex_q waiter. We don't remove ourselves from + * the wait_queue because we are the only user of it. + */ + add_wait_queue(&q->waiter, wait); + + /* Arm the timer */ + if (timeout) { + hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); + if (!hrtimer_active(&timeout->timer)) + timeout->task = NULL; + } + + /* + * !plist_node_empty() is safe here without any lock. + * q.lock_ptr != 0 is not safe, because of ordering against wakeup. + */ + if (likely(!plist_node_empty(&q->list))) { + /* + * If the timer has already expired, current will already be + * flagged for rescheduling. Only call schedule if there + * is no timeout, or if it has yet to expire. + */ + if (!timeout || timeout->task) + schedule(); + } + __set_current_state(TASK_RUNNING); +} + static int futex_wait(u32 __user *uaddr, int fshared, u32 val, ktime_t *abs_time, u32 bitset, int clockrt) { - struct task_struct *curr = current; + struct hrtimer_sleeper timeout, *to = NULL; + DECLARE_WAITQUEUE(wait, current); struct restart_block *restart; - DECLARE_WAITQUEUE(wait, curr); struct futex_hash_bucket *hb; struct futex_q q; u32 uval; int ret; - struct hrtimer_sleeper t; - int rem = 0; if (!bitset) return -EINVAL; q.pi_state = NULL; q.bitset = bitset; + + if (abs_time) { + to = &timeout; + + hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_sleeper(to, current); + hrtimer_set_expires_range_ns(&to->timer, *abs_time, + current->timer_slack_ns); + } + retry: q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); @@ -1178,75 +1241,22 @@ retry_private: goto retry; } ret = -EWOULDBLOCK; + + /* Only actually queue if *uaddr contained val. */ if (unlikely(uval != val)) { queue_unlock(&q, hb); goto out_put_key; } - /* Only actually queue if *uaddr contained val. */ - queue_me(&q, hb); - - /* - * There might have been scheduling since the queue_me(), as we - * cannot hold a spinlock across the get_user() in case it - * faults, and we cannot just set TASK_INTERRUPTIBLE state when - * queueing ourselves into the futex hash. This code thus has to - * rely on the futex_wake() code removing us from hash when it - * wakes us up. - */ - - /* add_wait_queue is the barrier after __set_current_state. */ - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&q.waiter, &wait); - /* - * !plist_node_empty() is safe here without any lock. - * q.lock_ptr != 0 is not safe, because of ordering against wakeup. - */ - if (likely(!plist_node_empty(&q.list))) { - if (!abs_time) - schedule(); - else { - hrtimer_init_on_stack(&t.timer, - clockrt ? CLOCK_REALTIME : - CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(&t, current); - hrtimer_set_expires_range_ns(&t.timer, *abs_time, - current->timer_slack_ns); - - hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&t.timer)) - t.task = NULL; - - /* - * the timer could have already expired, in which - * case current would be flagged for rescheduling. - * Don't bother calling schedule. - */ - if (likely(t.task)) - schedule(); - - hrtimer_cancel(&t.timer); - - /* Flag if a timeout occured */ - rem = (t.task == NULL); - - destroy_hrtimer_on_stack(&t.timer); - } - } - __set_current_state(TASK_RUNNING); - - /* - * NOTE: we don't remove ourselves from the waitqueue because - * we are the only user of it. - */ + /* queue_me and wait for wakeup, timeout, or a signal. */ + futex_wait_queue_me(hb, &q, to, &wait); /* If we were woken (and unqueued), we succeeded, whatever. */ ret = 0; if (!unqueue_me(&q)) goto out_put_key; ret = -ETIMEDOUT; - if (rem) + if (to && !to->task) goto out_put_key; /* @@ -1275,6 +1285,10 @@ retry_private: out_put_key: put_futex_key(fshared, &q.key); out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } return ret; } -- cgit v1.2.3 From 4b1c486b3587d2abf50bee4a05eb488cd4045f2c Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:39:42 -0700 Subject: futex: add helper to find the top prio waiter of a futex Improve legibility by wrapping finding the top waiter in a function. This will be used by the follow-on patches for enabling requeue pi. Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index ebb48d6d1a8..421fb5e42a1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -276,6 +276,25 @@ void put_futex_key(int fshared, union futex_key *key) drop_futex_key_refs(key); } +/** + * futex_top_waiter() - Return the highest priority waiter on a futex + * @hb: the hash bucket the futex_q's reside in + * @key: the futex key (to distinguish it from other futex futex_q's) + * + * Must be called with the hb lock held. + */ +static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, + union futex_key *key) +{ + struct futex_q *this; + + plist_for_each_entry(this, &hb->chain, list) { + if (match_futex(&this->key, key)) + return this; + } + return NULL; +} + static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) { u32 curval; -- cgit v1.2.3 From 1a52084d0919c2799258737c21fb328a9de159b5 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:39:52 -0700 Subject: futex: split out atomic logic from futex_lock_pi() Refactor the atomic portion of futex_lock_pi() into futex_lock_pi_atomic(). This logic will be needed by requeue_pi, so modularize it to reduce code duplication. The only significant change is passing of the task to try and take the lock for. This simplifies the -EDEADLK test as if the lock is owned by task t, it's a deadlock, regardless of if we are doing requeue pi or not. This patch updates the corresponding comment accordingly. Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 224 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 130 insertions(+), 94 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 421fb5e42a1..986b16e4453 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -556,6 +556,127 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return 0; } +/** + * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex + * @uaddr: the pi futex user address + * @hb: the pi futex hash bucket + * @key: the futex key associated with uaddr and hb + * @ps: the pi_state pointer where we store the result of the lookup + * @task: the task to perform the atomic lock work for. This will be + * "current" except in the case of requeue pi. + * + * Returns: + * 0 - ready to wait + * 1 - acquired the lock + * <0 - error + * + * The hb->lock and futex_key refs shall be held by the caller. + */ +static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, + union futex_key *key, + struct futex_pi_state **ps, + struct task_struct *task) +{ + int lock_taken, ret, ownerdied = 0; + u32 uval, newval, curval; + +retry: + ret = lock_taken = 0; + + /* + * To avoid races, we attempt to take the lock here again + * (by doing a 0 -> TID atomic cmpxchg), while holding all + * the locks. It will most likely not succeed. + */ + newval = task_pid_vnr(task); + + curval = cmpxchg_futex_value_locked(uaddr, 0, newval); + + if (unlikely(curval == -EFAULT)) + return -EFAULT; + + /* + * Detect deadlocks. + */ + if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) + return -EDEADLK; + + /* + * Surprise - we got the lock. Just return to userspace: + */ + if (unlikely(!curval)) + return 1; + + uval = curval; + + /* + * Set the FUTEX_WAITERS flag, so the owner will know it has someone + * to wake at the next unlock. + */ + newval = curval | FUTEX_WAITERS; + + /* + * There are two cases, where a futex might have no owner (the + * owner TID is 0): OWNER_DIED. We take over the futex in this + * case. We also do an unconditional take over, when the owner + * of the futex died. + * + * This is safe as we are protected by the hash bucket lock ! + */ + if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { + /* Keep the OWNER_DIED bit */ + newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); + ownerdied = 0; + lock_taken = 1; + } + + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + + if (unlikely(curval == -EFAULT)) + return -EFAULT; + if (unlikely(curval != uval)) + goto retry; + + /* + * We took the lock due to owner died take over. + */ + if (unlikely(lock_taken)) + return 1; + + /* + * We dont have the lock. Look up the PI state (or create it if + * we are the first waiter): + */ + ret = lookup_pi_state(uval, hb, key, ps); + + if (unlikely(ret)) { + switch (ret) { + case -ESRCH: + /* + * No owner found for this futex. Check if the + * OWNER_DIED bit is set to figure out whether + * this is a robust futex or not. + */ + if (get_futex_value_locked(&curval, uaddr)) + return -EFAULT; + + /* + * We simply start over in case of a robust + * futex. The code above will take the futex + * and return happy. + */ + if (curval & FUTEX_OWNER_DIED) { + ownerdied = 1; + goto retry; + } + default: + break; + } + } + + return ret; +} + /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. @@ -1340,9 +1461,9 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, struct hrtimer_sleeper timeout, *to = NULL; struct task_struct *curr = current; struct futex_hash_bucket *hb; - u32 uval, newval, curval; + u32 uval; struct futex_q q; - int ret, lock_taken, ownerdied = 0; + int ret; if (refill_pi_state_cache()) return -ENOMEM; @@ -1365,81 +1486,15 @@ retry: retry_private: hb = queue_lock(&q); -retry_locked: - ret = lock_taken = 0; - - /* - * To avoid races, we attempt to take the lock here again - * (by doing a 0 -> TID atomic cmpxchg), while holding all - * the locks. It will most likely not succeed. - */ - newval = task_pid_vnr(current); - - curval = cmpxchg_futex_value_locked(uaddr, 0, newval); - - if (unlikely(curval == -EFAULT)) - goto uaddr_faulted; - - /* - * Detect deadlocks. In case of REQUEUE_PI this is a valid - * situation and we return success to user space. - */ - if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { - ret = -EDEADLK; - goto out_unlock_put_key; - } - - /* - * Surprise - we got the lock. Just return to userspace: - */ - if (unlikely(!curval)) - goto out_unlock_put_key; - - uval = curval; - - /* - * Set the WAITERS flag, so the owner will know it has someone - * to wake at next unlock - */ - newval = curval | FUTEX_WAITERS; - - /* - * There are two cases, where a futex might have no owner (the - * owner TID is 0): OWNER_DIED. We take over the futex in this - * case. We also do an unconditional take over, when the owner - * of the futex died. - * - * This is safe as we are protected by the hash bucket lock ! - */ - if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { - /* Keep the OWNER_DIED bit */ - newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current); - ownerdied = 0; - lock_taken = 1; - } - - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (unlikely(curval == -EFAULT)) - goto uaddr_faulted; - if (unlikely(curval != uval)) - goto retry_locked; - - /* - * We took the lock due to owner died take over. - */ - if (unlikely(lock_taken)) - goto out_unlock_put_key; - - /* - * We dont have the lock. Look up the PI state (or create it if - * we are the first waiter): - */ - ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state); - + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current); if (unlikely(ret)) { switch (ret) { - + case 1: + /* We got the lock. */ + ret = 0; + goto out_unlock_put_key; + case -EFAULT: + goto uaddr_faulted; case -EAGAIN: /* * Task is exiting and we just wait for the @@ -1449,25 +1504,6 @@ retry_locked: put_futex_key(fshared, &q.key); cond_resched(); goto retry; - - case -ESRCH: - /* - * No owner found for this futex. Check if the - * OWNER_DIED bit is set to figure out whether - * this is a robust futex or not. - */ - if (get_futex_value_locked(&curval, uaddr)) - goto uaddr_faulted; - - /* - * We simply start over in case of a robust - * futex. The code above will take the futex - * and return happy. - */ - if (curval & FUTEX_OWNER_DIED) { - ownerdied = 1; - goto retry_locked; - } default: goto out_unlock_put_key; } -- cgit v1.2.3 From dd9739980b50c8cde33e1f8eb08b7e0140bcd61e Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:40:02 -0700 Subject: futex: split out fixup owner logic from futex_lock_pi() Refactor the post lock acquisition logic from futex_lock_pi(). This code will be reused in futex_wait_requeue_pi(). Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 158 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 89 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 986b16e4453..af831fbb7fb 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1255,6 +1255,79 @@ handle_fault: static long futex_wait_restart(struct restart_block *restart); +/** + * fixup_owner() - Post lock pi_state and corner case management + * @uaddr: user address of the futex + * @fshared: whether the futex is shared (1) or not (0) + * @q: futex_q (contains pi_state and access to the rt_mutex) + * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) + * + * After attempting to lock an rt_mutex, this function is called to cleanup + * the pi_state owner as well as handle race conditions that may allow us to + * acquire the lock. Must be called with the hb lock held. + * + * Returns: + * 1 - success, lock taken + * 0 - success, lock not taken + * <0 - on error (-EFAULT) + */ +static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, + int locked) +{ + struct task_struct *owner; + int ret = 0; + + if (locked) { + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case: + */ + if (q->pi_state->owner != current) + ret = fixup_pi_state_owner(uaddr, q, current, fshared); + goto out; + } + + /* + * Catch the rare case, where the lock was released when we were on the + * way back before we locked the hash bucket. + */ + if (q->pi_state->owner == current) { + /* + * Try to get the rt_mutex now. This might fail as some other + * task acquired the rt_mutex after we removed ourself from the + * rt_mutex waiters list. + */ + if (rt_mutex_trylock(&q->pi_state->pi_mutex)) { + locked = 1; + goto out; + } + + /* + * pi_state is incorrect, some other task did a lock steal and + * we returned due to timeout or signal without taking the + * rt_mutex. Too late. We can access the rt_mutex_owner without + * locking, as the other task is now blocked on the hash bucket + * lock. Fix the state up. + */ + owner = rt_mutex_owner(&q->pi_state->pi_mutex); + ret = fixup_pi_state_owner(uaddr, q, owner, fshared); + goto out; + } + + /* + * Paranoia check. If we did not take the lock, then we should not be + * the owner, nor the pending owner, of the rt_mutex. + */ + if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) + printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " + "pi-state %p\n", ret, + q->pi_state->pi_mutex.owner, + q->pi_state->owner); + +out: + return ret ? ret : locked; +} + /** * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal * @hb: the futex hash bucket, must be locked by the caller @@ -1459,11 +1532,10 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, int detect, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; - struct task_struct *curr = current; struct futex_hash_bucket *hb; u32 uval; struct futex_q q; - int ret; + int res, ret; if (refill_pi_state_cache()) return -ENOMEM; @@ -1527,71 +1599,21 @@ retry_private: } spin_lock(q.lock_ptr); - - if (!ret) { - /* - * Got the lock. We might not be the anticipated owner - * if we did a lock-steal - fix up the PI-state in - * that case: - */ - if (q.pi_state->owner != curr) - ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); - } else { - /* - * Catch the rare case, where the lock was released - * when we were on the way back before we locked the - * hash bucket. - */ - if (q.pi_state->owner == curr) { - /* - * Try to get the rt_mutex now. This might - * fail as some other task acquired the - * rt_mutex after we removed ourself from the - * rt_mutex waiters list. - */ - if (rt_mutex_trylock(&q.pi_state->pi_mutex)) - ret = 0; - else { - /* - * pi_state is incorrect, some other - * task did a lock steal and we - * returned due to timeout or signal - * without taking the rt_mutex. Too - * late. We can access the - * rt_mutex_owner without locking, as - * the other task is now blocked on - * the hash bucket lock. Fix the state - * up. - */ - struct task_struct *owner; - int res; - - owner = rt_mutex_owner(&q.pi_state->pi_mutex); - res = fixup_pi_state_owner(uaddr, &q, owner, - fshared); - - /* propagate -EFAULT, if the fixup failed */ - if (res) - ret = res; - } - } else { - /* - * Paranoia check. If we did not take the lock - * in the trylock above, then we should not be - * the owner of the rtmutex, neither the real - * nor the pending one: - */ - if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr) - printk(KERN_ERR "futex_lock_pi: ret = %d " - "pi-mutex: %p pi-state %p\n", ret, - q.pi_state->pi_mutex.owner, - q.pi_state->owner); - } - } + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_owner(uaddr, fshared, &q, !ret); + /* + * If fixup_owner() returned an error, proprogate that. If it acquired + * the lock, clear our -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; /* - * If fixup_pi_state_owner() faulted and was unable to handle the - * fault, unlock it and return the fault to userspace. + * If fixup_owner() faulted and was unable to handle the fault, unlock + * it and return the fault to userspace. */ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) rt_mutex_unlock(&q.pi_state->pi_mutex); @@ -1599,9 +1621,7 @@ retry_private: /* Unqueue and drop the lock */ unqueue_me_pi(&q); - if (to) - destroy_hrtimer_on_stack(&to->timer); - return ret != -EINTR ? ret : -ERESTARTNOINTR; + goto out; out_unlock_put_key: queue_unlock(&q, hb); @@ -1611,7 +1631,7 @@ out_put_key: out: if (to) destroy_hrtimer_on_stack(&to->timer); - return ret; + return ret != -EINTR ? ret : -ERESTARTNOINTR; uaddr_faulted: /* -- cgit v1.2.3 From 8dac456a681bd94272ff50ecb31be6b669382c2b Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:40:12 -0700 Subject: rt_mutex: add proxy lock routines This patch is a prerequisite for futex requeue_pi. It basically splits rt_mutex_slowlock() right down the middle, just before the first call to schedule(). It further adds helper functions which make use of the split and provide the rt-mutex preliminaries for futex requeue_pi. Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/rtmutex.c | 240 +++++++++++++++++++++++++++++++++++++----------- kernel/rtmutex_common.h | 8 ++ 2 files changed, 195 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 69d9cb921ff..fec77e7e056 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * assigned pending owner [which might not have taken the * lock yet]: */ -static inline int try_to_steal_lock(struct rt_mutex *lock) +static inline int try_to_steal_lock(struct rt_mutex *lock, + struct task_struct *task) { struct task_struct *pendowner = rt_mutex_owner(lock); struct rt_mutex_waiter *next; @@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock) if (!rt_mutex_owner_pending(lock)) return 0; - if (pendowner == current) + if (pendowner == task) return 1; spin_lock_irqsave(&pendowner->pi_lock, flags); - if (current->prio >= pendowner->prio) { + if (task->prio >= pendowner->prio) { spin_unlock_irqrestore(&pendowner->pi_lock, flags); return 0; } @@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock) * We are going to steal the lock and a waiter was * enqueued on the pending owners pi_waiters queue. So * we have to enqueue this waiter into - * current->pi_waiters list. This covers the case, - * where current is boosted because it holds another + * task->pi_waiters list. This covers the case, + * where task is boosted because it holds another * lock and gets unboosted because the booster is * interrupted, so we would delay a waiter with higher - * priority as current->normal_prio. + * priority as task->normal_prio. * * Note: in the rare case of a SCHED_OTHER task changing * its priority and thus stealing the lock, next->task - * might be current: + * might be task: */ - if (likely(next->task != current)) { - spin_lock_irqsave(¤t->pi_lock, flags); - plist_add(&next->pi_list_entry, ¤t->pi_waiters); - __rt_mutex_adjust_prio(current); - spin_unlock_irqrestore(¤t->pi_lock, flags); + if (likely(next->task != task)) { + spin_lock_irqsave(&task->pi_lock, flags); + plist_add(&next->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + spin_unlock_irqrestore(&task->pi_lock, flags); } return 1; } @@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ mark_rt_mutex_waiters(lock); - if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) + if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) return 0; /* We got the lock. */ @@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, + struct task_struct *task, int detect_deadlock) { struct task_struct *owner = rt_mutex_owner(lock); @@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, unsigned long flags; int chain_walk = 0, res; - spin_lock_irqsave(¤t->pi_lock, flags); - __rt_mutex_adjust_prio(current); - waiter->task = current; + spin_lock_irqsave(&task->pi_lock, flags); + __rt_mutex_adjust_prio(task); + waiter->task = task; waiter->lock = lock; - plist_node_init(&waiter->list_entry, current->prio); - plist_node_init(&waiter->pi_list_entry, current->prio); + plist_node_init(&waiter->list_entry, task->prio); + plist_node_init(&waiter->pi_list_entry, task->prio); /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) top_waiter = rt_mutex_top_waiter(lock); plist_add(&waiter->list_entry, &lock->wait_list); - current->pi_blocked_on = waiter; + task->pi_blocked_on = waiter; - spin_unlock_irqrestore(¤t->pi_lock, flags); + spin_unlock_irqrestore(&task->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { spin_lock_irqsave(&owner->pi_lock, flags); @@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, spin_unlock(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, - current); + task); spin_lock(&lock->wait_lock); @@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task) rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); } -/* - * Slow path lock function: +/** + * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop + * @lock: the rt_mutex to take + * @state: the state the task should block in (TASK_INTERRUPTIBLE + * or TASK_UNINTERRUPTIBLE) + * @timeout: the pre-initialized and started timer, or NULL for none + * @waiter: the pre-initialized rt_mutex_waiter + * @detect_deadlock: passed to task_blocks_on_rt_mutex + * + * lock->wait_lock must be held by the caller. */ static int __sched -rt_mutex_slowlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - int detect_deadlock) +__rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + struct rt_mutex_waiter *waiter, + int detect_deadlock) { - struct rt_mutex_waiter waiter; int ret = 0; - debug_rt_mutex_init_waiter(&waiter); - waiter.task = NULL; - - spin_lock(&lock->wait_lock); - - /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock)) { - spin_unlock(&lock->wait_lock); - return 0; - } - - set_current_state(state); - - /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) { - hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&timeout->timer)) - timeout->task = NULL; - } - for (;;) { /* Try to acquire the lock: */ if (try_to_take_rt_mutex(lock)) @@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, } /* - * waiter.task is NULL the first time we come here and + * waiter->task is NULL the first time we come here and * when we have been woken up by the previous owner * but the lock got stolen by a higher prio task. */ - if (!waiter.task) { - ret = task_blocks_on_rt_mutex(lock, &waiter, + if (!waiter->task) { + ret = task_blocks_on_rt_mutex(lock, waiter, current, detect_deadlock); /* * If we got woken up by the owner then start loop * all over without going into schedule to try * to get the lock now: */ - if (unlikely(!waiter.task)) { + if (unlikely(!waiter->task)) { /* * Reset the return value. We might * have returned with -EDEADLK and the @@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, spin_unlock(&lock->wait_lock); - debug_rt_mutex_print_deadlock(&waiter); + debug_rt_mutex_print_deadlock(waiter); - if (waiter.task) + if (waiter->task) schedule_rt_mutex(lock); spin_lock(&lock->wait_lock); set_current_state(state); } + return ret; +} + +/* + * Slow path lock function: + */ +static int __sched +rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock) +{ + struct rt_mutex_waiter waiter; + int ret = 0; + + debug_rt_mutex_init_waiter(&waiter); + waiter.task = NULL; + + spin_lock(&lock->wait_lock); + + /* Try to acquire the lock again: */ + if (try_to_take_rt_mutex(lock)) { + spin_unlock(&lock->wait_lock); + return 0; + } + + set_current_state(state); + + /* Setup the timer, when timeout != NULL */ + if (unlikely(timeout)) { + hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); + if (!hrtimer_active(&timeout->timer)) + timeout->task = NULL; + } + + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, + detect_deadlock); + set_current_state(TASK_RUNNING); if (unlikely(waiter.task)) @@ -985,6 +1012,59 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, rt_mutex_deadlock_account_unlock(proxy_owner); } +/** + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * @detect_deadlock: perform deadlock detection (1) or not (0) + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for FUTEX_REQUEUE_PI support. + */ +int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, int detect_deadlock) +{ + int ret; + + spin_lock(&lock->wait_lock); + + mark_rt_mutex_waiters(lock); + + if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { + /* We got the lock for task. */ + debug_rt_mutex_lock(lock); + + rt_mutex_set_owner(lock, task, 0); + + rt_mutex_deadlock_account_lock(lock, task); + return 1; + } + + ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); + + + if (ret && !waiter->task) { + /* + * Reset the return value. We might have + * returned with -EDEADLK and the owner + * released the lock while we were walking the + * pi chain. Let the waiter sort it out. + */ + ret = 0; + } + spin_unlock(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(waiter); + + return ret; +} + /** * rt_mutex_next_owner - return the next owner of the lock * @@ -1004,3 +1084,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) return rt_mutex_top_waiter(lock)->task; } + +/** + * rt_mutex_finish_proxy_lock() - Complete lock acquisition + * @lock: the rt_mutex we were woken on + * @to: the timeout, null if none. hrtimer should already have + * been started. + * @waiter: the pre-initialized rt_mutex_waiter + * @detect_deadlock: perform deadlock detection (1) or not (0) + * + * Complete the lock acquisition started our behalf by another thread. + * + * Returns: + * 0 - success + * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK + * + * Special API call for PI-futex requeue support + */ +int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter, + int detect_deadlock) +{ + int ret; + + spin_lock(&lock->wait_lock); + + set_current_state(TASK_INTERRUPTIBLE); + + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, + detect_deadlock); + + set_current_state(TASK_RUNNING); + + if (unlikely(waiter->task)) + remove_waiter(lock, waiter); + + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock); + + spin_unlock(&lock->wait_lock); + + /* + * Readjust priority, when we did not get the lock. We might have been + * the pending owner and boosted. Since we did not take the lock, the + * PI boost has to go. + */ + if (unlikely(ret)) + rt_mutex_adjust_prio(current); + + return ret; +} diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index e124bf5800e..97a2f81866a 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); +extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, + int detect_deadlock); +extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter, + int detect_deadlock); #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" -- cgit v1.2.3 From a72188d8a64ebe74722f1cf7ffac41b41ffdba21 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:40:22 -0700 Subject: futex: add FUTEX_HAS_TIMEOUT flag to restart.futex.flags Currently restart is only used if there is a timeout. The requeue_pi functionality requires restarting to futex_lock_pi() on signal after wakeup in futex_wait_requeue_pi() regardless of if there was a timeout or not. Using 0 for the timeout value is confusing as that could indicate an expired timer. The flag makes this explicit. While the check is not technically needed in futex_wait_restart(), doing so makes the code consistent with and will avoid confusion should the need arise to restart wait without a timeout. Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index af831fbb7fb..6b597cf33b0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1252,6 +1252,7 @@ handle_fault: */ #define FLAGS_SHARED 0x01 #define FLAGS_CLOCKRT 0x02 +#define FLAGS_HAS_TIMEOUT 0x04 static long futex_wait_restart(struct restart_block *restart); @@ -1486,7 +1487,7 @@ retry_private: restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; - restart->futex.flags = 0; + restart->futex.flags = FLAGS_HAS_TIMEOUT; if (fshared) restart->futex.flags |= FLAGS_SHARED; @@ -1510,13 +1511,16 @@ static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; int fshared = 0; - ktime_t t; + ktime_t t, *tp = NULL; - t.tv64 = restart->futex.time; + if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { + t.tv64 = restart->futex.time; + tp = &t; + } restart->fn = do_no_restart_syscall; if (restart->futex.flags & FLAGS_SHARED) fshared = 1; - return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, + return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, restart->futex.bitset, restart->futex.flags & FLAGS_CLOCKRT); } -- cgit v1.2.3 From 9121e4783cd5c7e2a407763f3b61c2d573891133 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:40:31 -0700 Subject: futex: distangle futex_requeue() futex_requeue() is getting a bit long-winded, and will be getting more so after the requeue_pi patch. Factor out the actual requeueing into a nicely contained inline function to reduce function length and improve legibility. Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6b597cf33b0..e76942e2a79 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -940,6 +940,34 @@ out: return ret; } +/** + * requeue_futex() - Requeue a futex_q from one hb to another + * @q: the futex_q to requeue + * @hb1: the source hash_bucket + * @hb2: the target hash_bucket + * @key2: the new key for the requeued futex_q + */ +static inline +void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, union futex_key *key2) +{ + + /* + * If key1 and key2 hash to the same bucket, no need to + * requeue. + */ + if (likely(&hb1->chain != &hb2->chain)) { + plist_del(&q->list, &hb1->chain); + plist_add(&q->list, &hb2->chain); + q->lock_ptr = &hb2->lock; +#ifdef CONFIG_DEBUG_PI_LIST + q->list.plist.lock = &hb2->lock; +#endif + } + get_futex_key_refs(key2); + q->key = *key2; +} + /* * Requeue all waiters hashed on one physical page to another * physical page. @@ -999,20 +1027,7 @@ retry_private: if (++ret <= nr_wake) { wake_futex(this); } else { - /* - * If key1 and key2 hash to the same bucket, no need to - * requeue. - */ - if (likely(head1 != &hb2->chain)) { - plist_del(&this->list, &hb1->chain); - plist_add(&this->list, &hb2->chain); - this->lock_ptr = &hb2->lock; -#ifdef CONFIG_DEBUG_PI_LIST - this->list.plist.lock = &hb2->lock; -#endif - } - this->key = key2; - get_futex_key_refs(&key2); + requeue_futex(this, hb1, hb2, &key2); drop_count++; if (ret - nr_wake >= nr_requeue) -- cgit v1.2.3 From f801073f87aa22ddf0e9146355fec3993163790f Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:40:40 -0700 Subject: futex: split out futex value validation code Refactor the code to validate the expected futex value in order to reuse it with the requeue_pi code. Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 116 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 72 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e76942e2a79..dbe857aa438 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1398,42 +1398,29 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, __set_current_state(TASK_RUNNING); } -static int futex_wait(u32 __user *uaddr, int fshared, - u32 val, ktime_t *abs_time, u32 bitset, int clockrt) +/** + * futex_wait_setup() - Prepare to wait on a futex + * @uaddr: the futex userspace address + * @val: the expected value + * @fshared: whether the futex is shared (1) or not (0) + * @q: the associated futex_q + * @hb: storage for hash_bucket pointer to be returned to caller + * + * Setup the futex_q and locate the hash_bucket. Get the futex value and + * compare it with the expected value. Handle atomic faults internally. + * Return with the hb lock held and a q.key reference on success, and unlocked + * with no q.key reference on failure. + * + * Returns: + * 0 - uaddr contains val and hb has been locked + * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked + */ +static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, + struct futex_q *q, struct futex_hash_bucket **hb) { - struct hrtimer_sleeper timeout, *to = NULL; - DECLARE_WAITQUEUE(wait, current); - struct restart_block *restart; - struct futex_hash_bucket *hb; - struct futex_q q; u32 uval; int ret; - if (!bitset) - return -EINVAL; - - q.pi_state = NULL; - q.bitset = bitset; - - if (abs_time) { - to = &timeout; - - hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : - CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - hrtimer_init_sleeper(to, current); - hrtimer_set_expires_range_ns(&to->timer, *abs_time, - current->timer_slack_ns); - } - -retry: - q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key); - if (unlikely(ret != 0)) - goto out; - -retry_private: - hb = queue_lock(&q); - /* * Access the page AFTER the hash-bucket is locked. * Order is important: @@ -1450,33 +1437,74 @@ retry_private: * A consequence is that futex_wait() can return zero and absorb * a wakeup when *uaddr != val on entry to the syscall. This is * rare, but normal. - * - * For shared futexes, we hold the mmap semaphore, so the mapping - * cannot have changed since we looked it up in get_futex_key. */ +retry: + q->key = FUTEX_KEY_INIT; + ret = get_futex_key(uaddr, fshared, &q->key); + if (unlikely(ret != 0)) + goto out; + +retry_private: + *hb = queue_lock(q); + ret = get_futex_value_locked(&uval, uaddr); - if (unlikely(ret)) { - queue_unlock(&q, hb); + if (ret) { + queue_unlock(q, *hb); ret = get_user(uval, uaddr); if (ret) - goto out_put_key; + goto out; if (!fshared) goto retry_private; - put_futex_key(fshared, &q.key); + put_futex_key(fshared, &q->key); goto retry; } - ret = -EWOULDBLOCK; - /* Only actually queue if *uaddr contained val. */ - if (unlikely(uval != val)) { - queue_unlock(&q, hb); - goto out_put_key; + if (uval != val) { + queue_unlock(q, *hb); + ret = -EWOULDBLOCK; } +out: + if (ret) + put_futex_key(fshared, &q->key); + return ret; +} + +static int futex_wait(u32 __user *uaddr, int fshared, + u32 val, ktime_t *abs_time, u32 bitset, int clockrt) +{ + struct hrtimer_sleeper timeout, *to = NULL; + DECLARE_WAITQUEUE(wait, current); + struct restart_block *restart; + struct futex_hash_bucket *hb; + struct futex_q q; + int ret; + + if (!bitset) + return -EINVAL; + + q.pi_state = NULL; + q.bitset = bitset; + + if (abs_time) { + to = &timeout; + + hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_sleeper(to, current); + hrtimer_set_expires_range_ns(&to->timer, *abs_time, + current->timer_slack_ns); + } + + /* Prepare to wait on uaddr. */ + ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + if (ret) + goto out; + /* queue_me and wait for wakeup, timeout, or a signal. */ futex_wait_queue_me(hb, &q, to, &wait); -- cgit v1.2.3 From 52400ba946759af28442dee6265c5c0180ac7122 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:40:49 -0700 Subject: futex: add requeue_pi functionality PI Futexes and their underlying rt_mutex cannot be left ownerless if there are pending waiters as this will break the PI boosting logic, so the standard requeue commands aren't sufficient. The new commands properly manage pi futex ownership by ensuring a futex with waiters has an owner at all times. This will allow glibc to properly handle pi mutexes with pthread_condvars. The approach taken here is to create two new futex op codes: FUTEX_WAIT_REQUEUE_PI: Tasks will use this op code to wait on a futex (such as a non-pi waitqueue) and wake after they have been requeued to a pi futex. Prior to returning to userspace, they will acquire this pi futex (and the underlying rt_mutex). futex_wait_requeue_pi() is the result of a high speed collision between futex_wait() and futex_lock_pi() (with the first part of futex_lock_pi() being done by futex_proxy_trylock_atomic() on behalf of the top_waiter). FUTEX_REQUEUE_PI (and FUTEX_CMP_REQUEUE_PI): This call must be used to wake tasks waiting with FUTEX_WAIT_REQUEUE_PI, regardless of how many tasks the caller intends to wake or requeue. pthread_cond_broadcast() should call this with nr_wake=1 and nr_requeue=INT_MAX. pthread_cond_signal() should call this with nr_wake=1 and nr_requeue=0. The reason being we need both callers to get the benefit of the futex_proxy_trylock_atomic() routine. futex_requeue() also enqueues the top_waiter on the rt_mutex via rt_mutex_start_proxy_lock(). Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 519 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 500 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index dbe857aa438..185c981d89e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -19,6 +19,10 @@ * PRIVATE futexes by Eric Dumazet * Copyright (C) 2007 Eric Dumazet * + * Requeue-PI support by Darren Hart + * Copyright (C) IBM Corporation, 2009 + * Thanks to Thomas Gleixner for conceptual design and careful reviews. + * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. @@ -109,6 +113,9 @@ struct futex_q { struct futex_pi_state *pi_state; struct task_struct *task; + /* rt_waiter storage for requeue_pi: */ + struct rt_mutex_waiter *rt_waiter; + /* Bitset for the optional bitmasked wakeup */ u32 bitset; }; @@ -827,7 +834,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key)) { - if (this->pi_state) { + if (this->pi_state || this->rt_waiter) { ret = -EINVAL; break; } @@ -968,20 +975,138 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, q->key = *key2; } -/* - * Requeue all waiters hashed on one physical page to another - * physical page. +/** + * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue + * q: the futex_q + * key: the key of the requeue target futex + * + * During futex_requeue, with requeue_pi=1, it is possible to acquire the + * target futex if it is uncontended or via a lock steal. Set the futex_q key + * to the requeue target futex so the waiter can detect the wakeup on the right + * futex, but remove it from the hb and NULL the rt_waiter so it can detect + * atomic lock acquisition. Must be called with the q->lock_ptr held. + */ +static inline +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) +{ + drop_futex_key_refs(&q->key); + get_futex_key_refs(key); + q->key = *key; + + WARN_ON(plist_node_empty(&q->list)); + plist_del(&q->list, &q->list.plist); + + WARN_ON(!q->rt_waiter); + q->rt_waiter = NULL; + + wake_up(&q->waiter); +} + +/** + * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter + * @pifutex: the user address of the to futex + * @hb1: the from futex hash bucket, must be locked by the caller + * @hb2: the to futex hash bucket, must be locked by the caller + * @key1: the from futex key + * @key2: the to futex key + * + * Try and get the lock on behalf of the top waiter if we can do it atomically. + * Wake the top waiter if we succeed. hb1 and hb2 must be held by the caller. + * + * Returns: + * 0 - failed to acquire the lock atomicly + * 1 - acquired the lock + * <0 - error + */ +static int futex_proxy_trylock_atomic(u32 __user *pifutex, + struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, + union futex_key *key1, union futex_key *key2, + struct futex_pi_state **ps) +{ + struct futex_q *top_waiter; + u32 curval; + int ret; + + if (get_futex_value_locked(&curval, pifutex)) + return -EFAULT; + + top_waiter = futex_top_waiter(hb1, key1); + + /* There are no waiters, nothing for us to do. */ + if (!top_waiter) + return 0; + + /* + * Either take the lock for top_waiter or set the FUTEX_WAITERS bit. + * The pi_state is returned in ps in contended cases. + */ + ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task); + if (ret == 1) + requeue_pi_wake_futex(top_waiter, key2); + + return ret; +} + +/** + * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 + * uaddr1: source futex user address + * uaddr2: target futex user address + * nr_wake: number of waiters to wake (must be 1 for requeue_pi) + * nr_requeue: number of waiters to requeue (0-INT_MAX) + * requeue_pi: if we are attempting to requeue from a non-pi futex to a + * pi futex (pi to pi requeue is not supported) + * + * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire + * uaddr2 atomically on behalf of the top waiter. + * + * Returns: + * >=0 - on success, the number of tasks requeued or woken + * <0 - on error */ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, - int nr_wake, int nr_requeue, u32 *cmpval) + int nr_wake, int nr_requeue, u32 *cmpval, + int requeue_pi) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; + int drop_count = 0, task_count = 0, ret; + struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct plist_head *head1; struct futex_q *this, *next; - int ret, drop_count = 0; + u32 curval2; + + if (requeue_pi) { + /* + * requeue_pi requires a pi_state, try to allocate it now + * without any locks in case it fails. + */ + if (refill_pi_state_cache()) + return -ENOMEM; + /* + * requeue_pi must wake as many tasks as it can, up to nr_wake + * + nr_requeue, since it acquires the rt_mutex prior to + * returning to userspace, so as to not leave the rt_mutex with + * waiters and no owner. However, second and third wake-ups + * cannot be predicted as they involve race conditions with the + * first wake and a fault while looking up the pi_state. Both + * pthread_cond_signal() and pthread_cond_broadcast() should + * use nr_wake=1. + */ + if (nr_wake != 1) + return -EINVAL; + } retry: + if (pi_state != NULL) { + /* + * We will have to lookup the pi_state again, so free this one + * to keep the accounting correct. + */ + free_pi_state(pi_state); + pi_state = NULL; + } + ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; @@ -1020,19 +1145,94 @@ retry_private: } } + if (requeue_pi && (task_count - nr_wake < nr_requeue)) { + /* Attempt to acquire uaddr2 and wake the top_waiter. */ + ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, + &key2, &pi_state); + + /* + * At this point the top_waiter has either taken uaddr2 or is + * waiting on it. If the former, then the pi_state will not + * exist yet, look it up one more time to ensure we have a + * reference to it. + */ + if (ret == 1) { + WARN_ON(pi_state); + task_count++; + ret = get_futex_value_locked(&curval2, uaddr2); + if (!ret) + ret = lookup_pi_state(curval2, hb2, &key2, + &pi_state); + } + + switch (ret) { + case 0: + break; + case -EFAULT: + double_unlock_hb(hb1, hb2); + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); + ret = get_user(curval2, uaddr2); + if (!ret) + goto retry; + goto out; + case -EAGAIN: + /* The owner was exiting, try again. */ + double_unlock_hb(hb1, hb2); + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); + cond_resched(); + goto retry; + default: + goto out_unlock; + } + } + head1 = &hb1->chain; plist_for_each_entry_safe(this, next, head1, list) { - if (!match_futex (&this->key, &key1)) + if (task_count - nr_wake >= nr_requeue) + break; + + if (!match_futex(&this->key, &key1)) continue; - if (++ret <= nr_wake) { + + WARN_ON(!requeue_pi && this->rt_waiter); + WARN_ON(requeue_pi && !this->rt_waiter); + + /* + * Wake nr_wake waiters. For requeue_pi, if we acquired the + * lock, we already woke the top_waiter. If not, it will be + * woken by futex_unlock_pi(). + */ + if (++task_count <= nr_wake && !requeue_pi) { wake_futex(this); - } else { - requeue_futex(this, hb1, hb2, &key2); - drop_count++; + continue; + } - if (ret - nr_wake >= nr_requeue) - break; + /* + * Requeue nr_requeue waiters and possibly one more in the case + * of requeue_pi if we couldn't acquire the lock atomically. + */ + if (requeue_pi) { + /* Prepare the waiter to take the rt_mutex. */ + atomic_inc(&pi_state->refcount); + this->pi_state = pi_state; + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, + this->rt_waiter, + this->task, 1); + if (ret == 1) { + /* We got the lock. */ + requeue_pi_wake_futex(this, &key2); + continue; + } else if (ret) { + /* -EDEADLK */ + this->pi_state = NULL; + free_pi_state(pi_state); + goto out_unlock; + } } + requeue_futex(this, hb1, hb2, &key2); + drop_count++; } out_unlock: @@ -1047,7 +1247,9 @@ out_put_keys: out_put_key1: put_futex_key(fshared, &key1); out: - return ret; + if (pi_state != NULL) + free_pi_state(pi_state); + return ret ? ret : task_count; } /* The key must be already stored in q->key. */ @@ -1270,6 +1472,7 @@ handle_fault: #define FLAGS_HAS_TIMEOUT 0x04 static long futex_wait_restart(struct restart_block *restart); +static long futex_lock_pi_restart(struct restart_block *restart); /** * fixup_owner() - Post lock pi_state and corner case management @@ -1489,6 +1692,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, q.pi_state = NULL; q.bitset = bitset; + q.rt_waiter = NULL; if (abs_time) { to = &timeout; @@ -1596,6 +1800,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, } q.pi_state = NULL; + q.rt_waiter = NULL; retry: q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); @@ -1701,6 +1906,20 @@ uaddr_faulted: goto retry; } +static long futex_lock_pi_restart(struct restart_block *restart) +{ + u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; + ktime_t t, *tp = NULL; + int fshared = restart->futex.flags & FLAGS_SHARED; + + if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { + t.tv64 = restart->futex.time; + tp = &t; + } + restart->fn = do_no_restart_syscall; + + return (long)futex_lock_pi(uaddr, fshared, restart->futex.val, tp, 0); +} /* * Userspace attempted a TID -> 0 atomic transition, and failed. @@ -1803,6 +2022,253 @@ pi_faulted: return ret; } +/** + * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex + * @hb: the hash_bucket futex_q was original enqueued on + * @q: the futex_q woken while waiting to be requeued + * @key2: the futex_key of the requeue target futex + * @timeout: the timeout associated with the wait (NULL if none) + * + * Detect if the task was woken on the initial futex as opposed to the requeue + * target futex. If so, determine if it was a timeout or a signal that caused + * the wakeup and return the appropriate error code to the caller. Must be + * called with the hb lock held. + * + * Returns + * 0 - no early wakeup detected + * <0 - -ETIMEDOUT or -ERESTARTSYS (FIXME: or ERESTARTNOINTR?) + */ +static inline +int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, + struct futex_q *q, union futex_key *key2, + struct hrtimer_sleeper *timeout) +{ + int ret = 0; + + /* + * With the hb lock held, we avoid races while we process the wakeup. + * We only need to hold hb (and not hb2) to ensure atomicity as the + * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. + * It can't be requeued from uaddr2 to something else since we don't + * support a PI aware source futex for requeue. + */ + if (!match_futex(&q->key, key2)) { + WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); + /* + * We were woken prior to requeue by a timeout or a signal. + * Unqueue the futex_q and determine which it was. + */ + plist_del(&q->list, &q->list.plist); + drop_futex_key_refs(&q->key); + + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + else { + /* + * We expect signal_pending(current), but another + * thread may have handled it for us already. + */ + /* FIXME: ERESTARTSYS or ERESTARTNOINTR? Do we care if + * the user specified SA_RESTART or not? */ + ret = -ERESTARTSYS; + } + } + return ret; +} + +/** + * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 + * @uaddr: the futex we initialyl wait on (non-pi) + * @fshared: whether the futexes are shared (1) or not (0). They must be + * the same type, no requeueing from private to shared, etc. + * @val: the expected value of uaddr + * @abs_time: absolute timeout + * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. + * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) + * @uaddr2: the pi futex we will take prior to returning to user-space + * + * The caller will wait on uaddr and will be requeued by futex_requeue() to + * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and + * complete the acquisition of the rt_mutex prior to returning to userspace. + * This ensures the rt_mutex maintains an owner when it has waiters; without + * one, the pi logic wouldn't know which task to boost/deboost, if there was a + * need to. + * + * We call schedule in futex_wait_queue_me() when we enqueue and return there + * via the following: + * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() + * 2) wakeup on uaddr2 after a requeue and subsequent unlock + * 3) signal (before or after requeue) + * 4) timeout (before or after requeue) + * + * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. + * + * If 2, we may then block on trying to take the rt_mutex and return via: + * 5) successful lock + * 6) signal + * 7) timeout + * 8) other lock acquisition failure + * + * If 6, we setup a restart_block with futex_lock_pi() as the function. + * + * If 4 or 7, we cleanup and return with -ETIMEDOUT. + * + * Returns: + * 0 - On success + * <0 - On error + */ +static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, + u32 val, ktime_t *abs_time, u32 bitset, + int clockrt, u32 __user *uaddr2) +{ + struct hrtimer_sleeper timeout, *to = NULL; + struct rt_mutex_waiter rt_waiter; + struct rt_mutex *pi_mutex = NULL; + DECLARE_WAITQUEUE(wait, current); + struct restart_block *restart; + struct futex_hash_bucket *hb; + union futex_key key2; + struct futex_q q; + int res, ret; + u32 uval; + + if (!bitset) + return -EINVAL; + + if (abs_time) { + to = &timeout; + hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_sleeper(to, current); + hrtimer_set_expires_range_ns(&to->timer, *abs_time, + current->timer_slack_ns); + } + + /* + * The waiter is allocated on our stack, manipulated by the requeue + * code while we sleep on uaddr. + */ + debug_rt_mutex_init_waiter(&rt_waiter); + rt_waiter.task = NULL; + + q.pi_state = NULL; + q.bitset = bitset; + q.rt_waiter = &rt_waiter; + + key2 = FUTEX_KEY_INIT; + ret = get_futex_key(uaddr2, fshared, &key2); + if (unlikely(ret != 0)) + goto out; + + /* Prepare to wait on uaddr. */ + ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + if (ret) { + put_futex_key(fshared, &key2); + goto out; + } + + /* Queue the futex_q, drop the hb lock, wait for wakeup. */ + futex_wait_queue_me(hb, &q, to, &wait); + + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); + spin_unlock(&hb->lock); + if (ret) + goto out_put_keys; + + /* + * In order for us to be here, we know our q.key == key2, and since + * we took the hb->lock above, we also know that futex_requeue() has + * completed and we no longer have to concern ourselves with a wakeup + * race with the atomic proxy lock acquition by the requeue code. + */ + + /* Check if the requeue code acquired the second futex for us. */ + if (!q.rt_waiter) { + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case. + */ + if (q.pi_state && (q.pi_state->owner != current)) { + spin_lock(q.lock_ptr); + ret = fixup_pi_state_owner(uaddr2, &q, current, + fshared); + spin_unlock(q.lock_ptr); + } + } else { + /* + * We have been woken up by futex_unlock_pi(), a timeout, or a + * signal. futex_unlock_pi() will not destroy the lock_ptr nor + * the pi_state. + */ + WARN_ON(!&q.pi_state); + pi_mutex = &q.pi_state->pi_mutex; + ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); + debug_rt_mutex_free_waiter(&rt_waiter); + + spin_lock(q.lock_ptr); + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_owner(uaddr2, fshared, &q, !ret); + /* + * If fixup_owner() returned an error, proprogate that. If it + * acquired the lock, clear our -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; + + /* Unqueue and drop the lock. */ + unqueue_me_pi(&q); + } + + /* + * If fixup_pi_state_owner() faulted and was unable to handle the + * fault, unlock the rt_mutex and return the fault to userspace. + */ + if (ret == -EFAULT) { + if (rt_mutex_owner(pi_mutex) == current) + rt_mutex_unlock(pi_mutex); + } else if (ret == -EINTR) { + ret = -EFAULT; + if (get_user(uval, uaddr2)) + goto out_put_keys; + + /* + * We've already been requeued, so restart by calling + * futex_lock_pi() directly, rather then returning to this + * function. + */ + ret = -ERESTART_RESTARTBLOCK; + restart = ¤t_thread_info()->restart_block; + restart->fn = futex_lock_pi_restart; + restart->futex.uaddr = (u32 *)uaddr2; + restart->futex.val = uval; + restart->futex.flags = 0; + if (abs_time) { + restart->futex.flags |= FLAGS_HAS_TIMEOUT; + restart->futex.time = abs_time->tv64; + } + + if (fshared) + restart->futex.flags |= FLAGS_SHARED; + if (clockrt) + restart->futex.flags |= FLAGS_CLOCKRT; + } + +out_put_keys: + put_futex_key(fshared, &q.key); + put_futex_key(fshared, &key2); + +out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + return ret; +} + /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. @@ -2025,7 +2491,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, fshared = 1; clockrt = op & FUTEX_CLOCK_REALTIME; - if (clockrt && cmd != FUTEX_WAIT_BITSET) + if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) return -ENOSYS; switch (cmd) { @@ -2040,10 +2506,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ret = futex_wake(uaddr, fshared, val, val3); break; case FUTEX_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); break; case FUTEX_CMP_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, + 0); break; case FUTEX_WAKE_OP: ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); @@ -2060,6 +2527,18 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, if (futex_cmpxchg_enabled) ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); break; + case FUTEX_WAIT_REQUEUE_PI: + val3 = FUTEX_BITSET_MATCH_ANY; + ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, + clockrt, uaddr2); + break; + case FUTEX_REQUEUE_PI: + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 1); + break; + case FUTEX_CMP_REQUEUE_PI: + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, + 1); + break; default: ret = -ENOSYS; } @@ -2077,7 +2556,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, int cmd = op & FUTEX_CMD_MASK; if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET)) { + cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_WAIT_REQUEUE_PI)) { if (copy_from_user(&ts, utime, sizeof(ts)) != 0) return -EFAULT; if (!timespec_valid(&ts)) @@ -2089,10 +2569,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, tp = &t; } /* - * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. + * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. */ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_REQUEUE_PI || cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) val2 = (u32) (unsigned long) utime; -- cgit v1.2.3 From 9c03d88e328d5f28f13191622c2ea1349c36b799 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:00 +0200 Subject: perf_counter: add more context information Change the callchain context entries to u16, so as to gain some space. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.457320003@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2a5d4f52556..727624db507 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1657,9 +1657,7 @@ void perf_counter_do_pending(void) * Callchain support -- arch specific */ -struct perf_callchain_entry * -__attribute__((weak)) -perf_callchain(struct pt_regs *regs) +__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) { return NULL; } @@ -1819,7 +1817,7 @@ void perf_counter_output(struct perf_counter *counter, callchain = perf_callchain(regs); if (callchain) { - callchain_size = (2 + callchain->nr) * sizeof(u64); + callchain_size = (1 + callchain->nr) * sizeof(u64); header.type |= __PERF_EVENT_CALLCHAIN; header.size += callchain_size; -- cgit v1.2.3 From 3c446b3d3b38f991f97e9d2df0ad26a60a94dcff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:01 +0200 Subject: perf_counter: SIGIO support Provide support for fcntl() I/O availability signals. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.579788800@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 727624db507..c58cc64319e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1526,6 +1526,22 @@ out: return ret; } +static int perf_fasync(int fd, struct file *filp, int on) +{ + struct perf_counter *counter = filp->private_data; + struct inode *inode = filp->f_path.dentry->d_inode; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &counter->fasync); + mutex_unlock(&inode->i_mutex); + + if (retval < 0) + return retval; + + return 0; +} + static const struct file_operations perf_fops = { .release = perf_release, .read = perf_read, @@ -1533,6 +1549,7 @@ static const struct file_operations perf_fops = { .unlocked_ioctl = perf_ioctl, .compat_ioctl = perf_ioctl, .mmap = perf_mmap, + .fasync = perf_fasync, }; /* @@ -1549,7 +1566,7 @@ void perf_counter_wakeup(struct perf_counter *counter) rcu_read_lock(); data = rcu_dereference(counter->data); if (data) { - (void)atomic_xchg(&data->wakeup, POLL_IN); + atomic_set(&data->wakeup, POLL_IN); /* * Ensure all data writes are issued before updating the * user-space data head information. The matching rmb() @@ -1561,6 +1578,7 @@ void perf_counter_wakeup(struct perf_counter *counter) rcu_read_unlock(); wake_up_all(&counter->waitq); + kill_fasync(&counter->fasync, SIGIO, POLL_IN); } /* -- cgit v1.2.3 From 671dec5daf3b3c43c5777be282f00120a44cf37f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:02 +0200 Subject: perf_counter: generalize pending infrastructure Prepare the pending infrastructure to do more than wakeups. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.634732847@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 53 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c58cc64319e..0a2ade2e4f1 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1581,6 +1581,14 @@ void perf_counter_wakeup(struct perf_counter *counter) kill_fasync(&counter->fasync, SIGIO, POLL_IN); } +static void perf_pending_wakeup(struct perf_pending_entry *entry) +{ + struct perf_counter *counter = container_of(entry, + struct perf_counter, pending); + + perf_counter_wakeup(counter); +} + /* * Pending wakeups * @@ -1590,45 +1598,47 @@ void perf_counter_wakeup(struct perf_counter *counter) * single linked list and use cmpxchg() to add entries lockless. */ -#define PENDING_TAIL ((struct perf_wakeup_entry *)-1UL) +#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) -static DEFINE_PER_CPU(struct perf_wakeup_entry *, perf_wakeup_head) = { +static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { PENDING_TAIL, }; -static void perf_pending_queue(struct perf_counter *counter) +static void perf_pending_queue(struct perf_pending_entry *entry, + void (*func)(struct perf_pending_entry *)) { - struct perf_wakeup_entry **head; - struct perf_wakeup_entry *prev, *next; + struct perf_pending_entry **head; - if (cmpxchg(&counter->wakeup.next, NULL, PENDING_TAIL) != NULL) + if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) return; - head = &get_cpu_var(perf_wakeup_head); + entry->func = func; + + head = &get_cpu_var(perf_pending_head); do { - prev = counter->wakeup.next = *head; - next = &counter->wakeup; - } while (cmpxchg(head, prev, next) != prev); + entry->next = *head; + } while (cmpxchg(head, entry->next, entry) != entry->next); set_perf_counter_pending(); - put_cpu_var(perf_wakeup_head); + put_cpu_var(perf_pending_head); } static int __perf_pending_run(void) { - struct perf_wakeup_entry *list; + struct perf_pending_entry *list; int nr = 0; - list = xchg(&__get_cpu_var(perf_wakeup_head), PENDING_TAIL); + list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); while (list != PENDING_TAIL) { - struct perf_counter *counter = container_of(list, - struct perf_counter, wakeup); + void (*func)(struct perf_pending_entry *); + struct perf_pending_entry *entry = list; list = list->next; - counter->wakeup.next = NULL; + func = entry->func; + entry->next = NULL; /* * Ensure we observe the unqueue before we issue the wakeup, * so that we won't be waiting forever. @@ -1636,7 +1646,7 @@ static int __perf_pending_run(void) */ smp_wmb(); - perf_counter_wakeup(counter); + func(entry); nr++; } @@ -1658,7 +1668,7 @@ static inline int perf_not_pending(struct perf_counter *counter) * so that we do not miss the wakeup. -- see perf_pending_handle() */ smp_rmb(); - return counter->wakeup.next == NULL; + return counter->pending.next == NULL; } static void perf_pending_sync(struct perf_counter *counter) @@ -1695,9 +1705,10 @@ struct perf_output_handle { static inline void __perf_output_wakeup(struct perf_output_handle *handle) { - if (handle->nmi) - perf_pending_queue(handle->counter); - else + if (handle->nmi) { + perf_pending_queue(&handle->counter->pending, + perf_pending_wakeup); + } else perf_counter_wakeup(handle->counter); } -- cgit v1.2.3 From f6c7d5fe58b4846ee0cb4b98b6042489705eced4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:04 +0200 Subject: perf_counter: theres more to overflow than writing events Prepare for more generic overflow handling. The new perf_counter_overflow() method will handle the generic bits of the counter overflow, and can return a !0 return value, in which case the counter should be (soft) disabled, so that it won't count until it's properly disabled. XXX: do powerpc and swcounter Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.812109629@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0a2ade2e4f1..195e976eb07 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1800,8 +1800,8 @@ static void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs) +static void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs) { int ret; u64 record_type = counter->hw_event.record_type; @@ -2033,6 +2033,17 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, perf_counter_mmap_event(&mmap_event); } +/* + * Generic counter overflow handling. + */ + +int perf_counter_overflow(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + perf_counter_output(counter, nmi, regs); + return 0; +} + /* * Generic software counter infrastructure */ @@ -2077,6 +2088,7 @@ static void perf_swcounter_set_period(struct perf_counter *counter) static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) { + enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_counter *counter; struct pt_regs *regs; @@ -2092,12 +2104,14 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) !counter->hw_event.exclude_user) regs = task_pt_regs(current); - if (regs) - perf_counter_output(counter, 0, regs); + if (regs) { + if (perf_counter_overflow(counter, 0, regs)) + ret = HRTIMER_NORESTART; + } hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); - return HRTIMER_RESTART; + return ret; } static void perf_swcounter_overflow(struct perf_counter *counter, @@ -2105,7 +2119,10 @@ static void perf_swcounter_overflow(struct perf_counter *counter, { perf_swcounter_update(counter); perf_swcounter_set_period(counter); - perf_counter_output(counter, nmi, regs); + if (perf_counter_overflow(counter, nmi, regs)) + /* soft-disable the counter */ + ; + } static int perf_swcounter_match(struct perf_counter *counter, -- cgit v1.2.3 From ebb3c4c4cb81d64cc041356915ec015e2c57092a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:05 +0200 Subject: perf_counter: fix the mlock accounting Reading through the code I saw I forgot the finish the mlock accounting. Do so now. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.899767331@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 195e976eb07..c841563de04 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1461,13 +1461,14 @@ static void perf_mmap_close(struct vm_area_struct *vma) if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { + vma->vm_mm->locked_vm -= counter->data->nr_pages + 1; perf_mmap_data_free(counter); mutex_unlock(&counter->mmap_mutex); } } static struct vm_operations_struct perf_mmap_vmops = { - .open = perf_mmap_open, + .open = perf_mmap_open, .close = perf_mmap_close, .fault = perf_mmap_fault, }; @@ -1499,24 +1500,32 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_pgoff != 0) return -EINVAL; - locked = vma_size >> PAGE_SHIFT; - locked += vma->vm_mm->locked_vm; + mutex_lock(&counter->mmap_mutex); + if (atomic_inc_not_zero(&counter->mmap_count)) { + if (nr_pages != counter->data->nr_pages) + ret = -EINVAL; + goto unlock; + } + + locked = vma->vm_mm->locked_vm; + locked += nr_pages + 1; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; lock_limit >>= PAGE_SHIFT; - if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) - return -EPERM; - - mutex_lock(&counter->mmap_mutex); - if (atomic_inc_not_zero(&counter->mmap_count)) - goto out; + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { + ret = -EPERM; + goto unlock; + } WARN_ON(counter->data); ret = perf_mmap_data_alloc(counter, nr_pages); - if (!ret) - atomic_set(&counter->mmap_count, 1); -out: + if (ret) + goto unlock; + + atomic_set(&counter->mmap_count, 1); + vma->vm_mm->locked_vm += nr_pages + 1; +unlock: mutex_unlock(&counter->mmap_mutex); vma->vm_flags &= ~VM_MAYWRITE; -- cgit v1.2.3 From 339f7c90b8a2f3aa2dd4267e79f797999e8a3c59 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:06 +0200 Subject: perf_counter: PERF_RECORD_TIME By popular request, provide means to log a timestamp along with the counter overflow event. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.024173282@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c841563de04..19990d1f021 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1826,6 +1826,7 @@ static void perf_counter_output(struct perf_counter *counter, } group_entry; struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; + u64 time; header.type = PERF_EVENT_COUNTER_OVERFLOW; header.size = sizeof(header); @@ -1862,6 +1863,16 @@ static void perf_counter_output(struct perf_counter *counter, } } + if (record_type & PERF_RECORD_TIME) { + /* + * Maybe do better on x86 and provide cpu_clock_nmi() + */ + time = sched_clock(); + + header.type |= __PERF_EVENT_TIME; + header.size += sizeof(u64); + } + ret = perf_output_begin(&handle, counter, header.size, nmi); if (ret) return; @@ -1895,6 +1906,9 @@ static void perf_counter_output(struct perf_counter *counter, if (callchain) perf_output_copy(&handle, callchain, callchain_size); + if (record_type & PERF_RECORD_TIME) + perf_output_put(&handle, time); + perf_output_end(&handle); } -- cgit v1.2.3 From 79f146415623fe74f39af67c0f6adc208939a410 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:07 +0200 Subject: perf_counter: counter overflow limit Provide means to auto-disable the counter after 'n' overflow events. Create the counter with hw_event.disabled = 1, and then issue an ioctl(fd, PREF_COUNTER_IOC_REFRESH, n); to set the limit and enable the counter. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.083139737@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 51 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 19990d1f021..c05e10354bc 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -744,6 +744,12 @@ static void perf_counter_enable(struct perf_counter *counter) spin_unlock_irq(&ctx->lock); } +static void perf_counter_refresh(struct perf_counter *counter, int refresh) +{ + atomic_add(refresh, &counter->event_limit); + perf_counter_enable(counter); +} + /* * Enable a counter and all its children. */ @@ -1311,6 +1317,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_COUNTER_IOC_DISABLE: perf_counter_disable_family(counter); break; + case PERF_COUNTER_IOC_REFRESH: + perf_counter_refresh(counter, arg); + break; default: err = -ENOTTY; } @@ -1590,14 +1599,6 @@ void perf_counter_wakeup(struct perf_counter *counter) kill_fasync(&counter->fasync, SIGIO, POLL_IN); } -static void perf_pending_wakeup(struct perf_pending_entry *entry) -{ - struct perf_counter *counter = container_of(entry, - struct perf_counter, pending); - - perf_counter_wakeup(counter); -} - /* * Pending wakeups * @@ -1607,6 +1608,22 @@ static void perf_pending_wakeup(struct perf_pending_entry *entry) * single linked list and use cmpxchg() to add entries lockless. */ +static void perf_pending_counter(struct perf_pending_entry *entry) +{ + struct perf_counter *counter = container_of(entry, + struct perf_counter, pending); + + if (counter->pending_disable) { + counter->pending_disable = 0; + perf_counter_disable(counter); + } + + if (counter->pending_wakeup) { + counter->pending_wakeup = 0; + perf_counter_wakeup(counter); + } +} + #define PENDING_TAIL ((struct perf_pending_entry *)-1UL) static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { @@ -1715,8 +1732,9 @@ struct perf_output_handle { static inline void __perf_output_wakeup(struct perf_output_handle *handle) { if (handle->nmi) { + handle->counter->pending_wakeup = 1; perf_pending_queue(&handle->counter->pending, - perf_pending_wakeup); + perf_pending_counter); } else perf_counter_wakeup(handle->counter); } @@ -2063,8 +2081,21 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, int perf_counter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs) { + int events = atomic_read(&counter->event_limit); + int ret = 0; + + if (events && atomic_dec_and_test(&counter->event_limit)) { + ret = 1; + if (nmi) { + counter->pending_disable = 1; + perf_pending_queue(&counter->pending, + perf_pending_counter); + } else + perf_counter_disable(counter); + } + perf_counter_output(counter, nmi, regs); - return 0; + return ret; } /* -- cgit v1.2.3 From 4c9e25428ff46b968a30f1dfafdba550cb6e4141 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:09 +0200 Subject: perf_counter: change event definition Currently the definition of an event is slightly ambiguous. We have wakeup events, for poll() and SIGIO, which are either generated when a record crosses a page boundary (hw_events.wakeup_events == 0), or every wakeup_events new records. Now a record can be either a counter overflow record, or a number of different things, like the mmap PROT_EXEC region notifications. Then there is the PERF_COUNTER_IOC_REFRESH event limit, which only considers counter overflows. This patch changes then wakeup_events and SIGIO notification to only consider overflow events. Furthermore it changes the SIGIO notification to report SIGHUP when the event limit is reached and the counter will be disabled. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.266679874@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c05e10354bc..8c8eaf0625f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1596,7 +1596,11 @@ void perf_counter_wakeup(struct perf_counter *counter) rcu_read_unlock(); wake_up_all(&counter->waitq); - kill_fasync(&counter->fasync, SIGIO, POLL_IN); + + if (counter->pending_kill) { + kill_fasync(&counter->fasync, SIGIO, counter->pending_kill); + counter->pending_kill = 0; + } } /* @@ -1727,6 +1731,7 @@ struct perf_output_handle { unsigned int head; int wakeup; int nmi; + int overflow; }; static inline void __perf_output_wakeup(struct perf_output_handle *handle) @@ -1741,7 +1746,7 @@ static inline void __perf_output_wakeup(struct perf_output_handle *handle) static int perf_output_begin(struct perf_output_handle *handle, struct perf_counter *counter, unsigned int size, - int nmi) + int nmi, int overflow) { struct perf_mmap_data *data; unsigned int offset, head; @@ -1751,8 +1756,9 @@ static int perf_output_begin(struct perf_output_handle *handle, if (!data) goto out; - handle->counter = counter; - handle->nmi = nmi; + handle->counter = counter; + handle->nmi = nmi; + handle->overflow = overflow; if (!data->nr_pages) goto fail; @@ -1816,7 +1822,7 @@ static void perf_output_end(struct perf_output_handle *handle) { int wakeup_events = handle->counter->hw_event.wakeup_events; - if (wakeup_events) { + if (handle->overflow && wakeup_events) { int events = atomic_inc_return(&handle->data->events); if (events >= wakeup_events) { atomic_sub(wakeup_events, &handle->data->events); @@ -1891,7 +1897,7 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } - ret = perf_output_begin(&handle, counter, header.size, nmi); + ret = perf_output_begin(&handle, counter, header.size, nmi, 1); if (ret) return; @@ -1955,7 +1961,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter, { struct perf_output_handle handle; int size = mmap_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0); + int ret = perf_output_begin(&handle, counter, size, 0, 0); if (ret) return; @@ -2084,8 +2090,10 @@ int perf_counter_overflow(struct perf_counter *counter, int events = atomic_read(&counter->event_limit); int ret = 0; + counter->pending_kill = POLL_IN; if (events && atomic_dec_and_test(&counter->event_limit)) { ret = 1; + counter->pending_kill = POLL_HUP; if (nmi) { counter->pending_disable = 1; perf_pending_queue(&counter->pending, -- cgit v1.2.3 From 4af4998b8aa35600f4c4a4f3c3a23baca6081d02 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:10 +0200 Subject: perf_counter: rework context time Since perf_counter_context is switched along with tasks, we can maintain the context time without using the task runtime clock. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.353552838@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 78 ++++++++++++++++++++++----------------------------- 1 file changed, 34 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8c8eaf0625f..84d85ab4e16 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -117,7 +117,7 @@ counter_sched_out(struct perf_counter *counter, return; counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_stopped = ctx->time_now; + counter->tstamp_stopped = ctx->time; counter->hw_ops->disable(counter); counter->oncpu = -1; @@ -253,27 +253,20 @@ retry: spin_unlock_irq(&ctx->lock); } -/* - * Get the current time for this context. - * If this is a task context, we use the task's task clock, - * or for a per-cpu context, we use the cpu clock. - */ -static u64 get_context_time(struct perf_counter_context *ctx, int update) +static inline u64 perf_clock(void) { - struct task_struct *curr = ctx->task; - - if (!curr) - return cpu_clock(smp_processor_id()); - - return __task_delta_exec(curr, update) + curr->se.sum_exec_runtime; + return cpu_clock(smp_processor_id()); } /* * Update the record of the current time in a context. */ -static void update_context_time(struct perf_counter_context *ctx, int update) +static void update_context_time(struct perf_counter_context *ctx) { - ctx->time_now = get_context_time(ctx, update) - ctx->time_lost; + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; } /* @@ -284,15 +277,17 @@ static void update_counter_times(struct perf_counter *counter) struct perf_counter_context *ctx = counter->ctx; u64 run_end; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { - counter->total_time_enabled = ctx->time_now - - counter->tstamp_enabled; - if (counter->state == PERF_COUNTER_STATE_INACTIVE) - run_end = counter->tstamp_stopped; - else - run_end = ctx->time_now; - counter->total_time_running = run_end - counter->tstamp_running; - } + if (counter->state < PERF_COUNTER_STATE_INACTIVE) + return; + + counter->total_time_enabled = ctx->time - counter->tstamp_enabled; + + if (counter->state == PERF_COUNTER_STATE_INACTIVE) + run_end = counter->tstamp_stopped; + else + run_end = ctx->time; + + counter->total_time_running = run_end - counter->tstamp_running; } /* @@ -332,7 +327,7 @@ static void __perf_counter_disable(void *info) * If it is in error state, leave it in error state. */ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { - update_context_time(ctx, 1); + update_context_time(ctx); update_counter_times(counter); if (counter == counter->group_leader) group_sched_out(counter, cpuctx, ctx); @@ -426,7 +421,7 @@ counter_sched_in(struct perf_counter *counter, return -EAGAIN; } - counter->tstamp_running += ctx->time_now - counter->tstamp_stopped; + counter->tstamp_running += ctx->time - counter->tstamp_stopped; if (!is_software_counter(counter)) cpuctx->active_oncpu++; @@ -493,9 +488,9 @@ static void add_counter_to_ctx(struct perf_counter *counter, list_add_counter(counter, ctx); ctx->nr_counters++; counter->prev_state = PERF_COUNTER_STATE_OFF; - counter->tstamp_enabled = ctx->time_now; - counter->tstamp_running = ctx->time_now; - counter->tstamp_stopped = ctx->time_now; + counter->tstamp_enabled = ctx->time; + counter->tstamp_running = ctx->time; + counter->tstamp_stopped = ctx->time; } /* @@ -522,7 +517,7 @@ static void __perf_install_in_context(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); - update_context_time(ctx, 1); + update_context_time(ctx); /* * Protect the list operation against NMI by disabling the @@ -648,13 +643,13 @@ static void __perf_counter_enable(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); - update_context_time(ctx, 1); + update_context_time(ctx); counter->prev_state = counter->state; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time_now - counter->total_time_enabled; + counter->tstamp_enabled = ctx->time - counter->total_time_enabled; /* * If the counter is in a group and isn't the group leader, @@ -737,8 +732,8 @@ static void perf_counter_enable(struct perf_counter *counter) */ if (counter->state == PERF_COUNTER_STATE_OFF) { counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time_now - - counter->total_time_enabled; + counter->tstamp_enabled = + ctx->time - counter->total_time_enabled; } out: spin_unlock_irq(&ctx->lock); @@ -778,7 +773,7 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, ctx->is_active = 0; if (likely(!ctx->nr_counters)) goto out; - update_context_time(ctx, 0); + update_context_time(ctx); flags = hw_perf_save_disable(); if (ctx->nr_active) { @@ -883,12 +878,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, if (likely(!ctx->nr_counters)) goto out; - /* - * Add any time since the last sched_out to the lost time - * so it doesn't get included in the total_time_enabled and - * total_time_running measures for counters in the context. - */ - ctx->time_lost = get_context_time(ctx, 0) - ctx->time_now; + ctx->timestamp = perf_clock(); flags = hw_perf_save_disable(); @@ -1043,8 +1033,8 @@ int perf_counter_task_enable(void) if (counter->state > PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time_now - - counter->total_time_enabled; + counter->tstamp_enabled = + ctx->time - counter->total_time_enabled; counter->hw_event.disabled = 0; } hw_perf_restore(perf_flags); @@ -1113,7 +1103,7 @@ static void __read(void *info) curr_rq_lock_irq_save(&flags); if (ctx->is_active) - update_context_time(ctx, 1); + update_context_time(ctx); counter->hw_ops->read(counter); update_counter_times(counter); curr_rq_unlock_irq_restore(&flags); -- cgit v1.2.3 From a39d6f2556c4a19f58f538c6aa28bf8faca4fcb8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:11 +0200 Subject: perf_counter: rework the task clock software counter Rework the task clock software counter to use the context time instead of the task runtime clock, this removes the last such user. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.445450972@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 42 ++++++++++++------------------------------ 1 file changed, 12 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 84d85ab4e16..56b7eb53d67 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -974,9 +974,6 @@ int perf_counter_task_disable(void) curr_rq_lock_irq_save(&flags); cpu = smp_processor_id(); - /* force the update of the task clock: */ - __task_delta_exec(curr, 1); - perf_counter_task_sched_out(curr, cpu); spin_lock(&ctx->lock); @@ -1017,9 +1014,6 @@ int perf_counter_task_enable(void) curr_rq_lock_irq_save(&flags); cpu = smp_processor_id(); - /* force the update of the task clock: */ - __task_delta_exec(curr, 1); - perf_counter_task_sched_out(curr, cpu); spin_lock(&ctx->lock); @@ -2347,38 +2341,28 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { * Software counter: task time clock */ -/* - * Called from within the scheduler: - */ -static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update) -{ - struct task_struct *curr = counter->task; - u64 delta; - - delta = __task_delta_exec(curr, update); - - return curr->se.sum_exec_runtime + delta; -} - -static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) +static void task_clock_perf_counter_update(struct perf_counter *counter) { - u64 prev; + u64 prev, now; s64 delta; - prev = atomic64_read(&counter->hw.prev_count); - - atomic64_set(&counter->hw.prev_count, now); + update_context_time(counter->ctx); + now = counter->ctx->time; + prev = atomic64_xchg(&counter->hw.prev_count, now); delta = now - prev; - atomic64_add(delta, &counter->count); } static int task_clock_perf_counter_enable(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; + u64 now; + + update_context_time(counter->ctx); + now = counter->ctx->time; - atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0)); + atomic64_set(&hwc->prev_count, now); hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; if (hwc->irq_period) { @@ -2393,14 +2377,12 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) static void task_clock_perf_counter_disable(struct perf_counter *counter) { hrtimer_cancel(&counter->hw.hrtimer); - task_clock_perf_counter_update(counter, - task_clock_perf_counter_val(counter, 0)); + task_clock_perf_counter_update(counter); } static void task_clock_perf_counter_read(struct perf_counter *counter) { - task_clock_perf_counter_update(counter, - task_clock_perf_counter_val(counter, 1)); + task_clock_perf_counter_update(counter); } static const struct hw_perf_counter_ops perf_ops_task_clock = { -- cgit v1.2.3 From 849691a6cd40270ff5f4a8846d5f6bf8df663ffc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:12 +0200 Subject: perf_counter: remove rq->lock usage Now that all the task runtime clock users are gone, remove the ugly rq->lock usage from perf counters, which solves the nasty deadlock seen when a software task clock counter was read from an NMI overflow context. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.531137582@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 42 ++++++++++++++++-------------------------- kernel/sched.c | 20 -------------------- 2 files changed, 16 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 56b7eb53d67..f4f7596f784 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -172,8 +172,7 @@ static void __perf_counter_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); counter_sched_out(counter, cpuctx, ctx); @@ -198,8 +197,7 @@ static void __perf_counter_remove_from_context(void *info) perf_max_counters - perf_reserved_percpu); } - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } @@ -319,8 +317,7 @@ static void __perf_counter_disable(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); /* * If the counter is on, turn it off. @@ -336,8 +333,7 @@ static void __perf_counter_disable(void *info) counter->state = PERF_COUNTER_STATE_OFF; } - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -515,8 +511,7 @@ static void __perf_install_in_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); update_context_time(ctx); /* @@ -565,8 +560,7 @@ static void __perf_install_in_context(void *info) unlock: hw_perf_restore(perf_flags); - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -641,8 +635,7 @@ static void __perf_counter_enable(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); update_context_time(ctx); counter->prev_state = counter->state; @@ -678,8 +671,7 @@ static void __perf_counter_enable(void *info) } unlock: - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -971,7 +963,7 @@ int perf_counter_task_disable(void) if (likely(!ctx->nr_counters)) return 0; - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); cpu = smp_processor_id(); perf_counter_task_sched_out(curr, cpu); @@ -992,9 +984,7 @@ int perf_counter_task_disable(void) hw_perf_restore(perf_flags); - spin_unlock(&ctx->lock); - - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); return 0; } @@ -1011,7 +1001,7 @@ int perf_counter_task_enable(void) if (likely(!ctx->nr_counters)) return 0; - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); cpu = smp_processor_id(); perf_counter_task_sched_out(curr, cpu); @@ -1037,7 +1027,7 @@ int perf_counter_task_enable(void) perf_counter_task_sched_in(curr, cpu); - curr_rq_unlock_irq_restore(&flags); + local_irq_restore(flags); return 0; } @@ -1095,12 +1085,12 @@ static void __read(void *info) struct perf_counter_context *ctx = counter->ctx; unsigned long flags; - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); if (ctx->is_active) update_context_time(ctx); counter->hw_ops->read(counter); update_counter_times(counter); - curr_rq_unlock_irq_restore(&flags); + local_irq_restore(flags); } static u64 perf_counter_read(struct perf_counter *counter) @@ -2890,7 +2880,7 @@ __perf_counter_exit_task(struct task_struct *child, * Be careful about zapping the list - IRQ/NMI context * could still be processing it: */ - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); perf_flags = hw_perf_save_disable(); cpuctx = &__get_cpu_var(perf_cpu_context); @@ -2903,7 +2893,7 @@ __perf_counter_exit_task(struct task_struct *child, child_ctx->nr_counters--; hw_perf_restore(perf_flags); - curr_rq_unlock_irq_restore(&flags); + local_irq_restore(flags); } parent_counter = child_counter->parent; diff --git a/kernel/sched.c b/kernel/sched.c index f76e3c0188a..0de2f814fb1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -997,26 +997,6 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } -void curr_rq_lock_irq_save(unsigned long *flags) - __acquires(rq->lock) -{ - struct rq *rq; - - local_irq_save(*flags); - rq = cpu_rq(smp_processor_id()); - spin_lock(&rq->lock); -} - -void curr_rq_unlock_irq_restore(unsigned long *flags) - __releases(rq->lock) -{ - struct rq *rq; - - rq = cpu_rq(smp_processor_id()); - spin_unlock(&rq->lock); - local_irq_restore(*flags); -} - void task_rq_unlock_wait(struct task_struct *p) { struct rq *rq = task_rq(p); -- cgit v1.2.3 From bce379bf358738ab8efc8cda2594a299ac685085 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:13 +0200 Subject: perf_counter: minimize context time updates Push the update_context_time() calls up the stack so that we get less invokations and thereby a less noisy output: before: # ./perfstat -e 1:0 -e 1:1 -e 1:1 -e 1:1 -l ls > /dev/null Performance counter stats for 'ls': 10.163691 cpu clock ticks (msecs) (scaled from 98.94%) 10.215360 task clock ticks (msecs) (scaled from 98.18%) 10.185549 task clock ticks (msecs) (scaled from 98.53%) 10.183581 task clock ticks (msecs) (scaled from 98.71%) Wall-clock time elapsed: 11.912858 msecs after: # ./perfstat -e 1:0 -e 1:1 -e 1:1 -e 1:1 -l ls > /dev/null Performance counter stats for 'ls': 9.316630 cpu clock ticks (msecs) 9.280789 task clock ticks (msecs) 9.280789 task clock ticks (msecs) 9.280789 task clock ticks (msecs) Wall-clock time elapsed: 9.574872 msecs Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.618876874@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f4f7596f784..863703b3158 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -319,6 +319,8 @@ static void __perf_counter_disable(void *info) spin_lock_irqsave(&ctx->lock, flags); + update_context_time(ctx); + /* * If the counter is on, turn it off. * If it is in error state, leave it in error state. @@ -797,6 +799,8 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) if (likely(!cpuctx->task_ctx)) return; + update_context_time(ctx); + regs = task_pt_regs(task); perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs); __perf_counter_sched_out(ctx, cpuctx); @@ -2336,7 +2340,6 @@ static void task_clock_perf_counter_update(struct perf_counter *counter) u64 prev, now; s64 delta; - update_context_time(counter->ctx); now = counter->ctx->time; prev = atomic64_xchg(&counter->hw.prev_count, now); @@ -2349,7 +2352,6 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; u64 now; - update_context_time(counter->ctx); now = counter->ctx->time; atomic64_set(&hwc->prev_count, now); @@ -2372,6 +2374,7 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter) static void task_clock_perf_counter_read(struct perf_counter *counter) { + update_context_time(counter->ctx); task_clock_perf_counter_update(counter); } -- cgit v1.2.3 From a26b89f05d194413c7238e0bea071054f6b5d3c8 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:34 +0200 Subject: sched, hw-branch-tracer: add wait_task_context_switch() function to sched.h Add a function to wait until some other task has been switched out at least once. This differs from wait_task_inactive() subtly, in that the latter will wait until the task has left the CPU. Signed-off-by: Markus Metzger Cc: markus.t.metzger@gmail.com Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144549.794157000@intel.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6cc1fd5d507..f91bc8141dc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2002,6 +2002,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) return 1; } +/* + * wait_task_context_switch - wait for a thread to complete at least one + * context switch. + * + * @p must not be current. + */ +void wait_task_context_switch(struct task_struct *p) +{ + unsigned long nvcsw, nivcsw, flags; + int running; + struct rq *rq; + + nvcsw = p->nvcsw; + nivcsw = p->nivcsw; + for (;;) { + /* + * The runqueue is assigned before the actual context + * switch. We need to take the runqueue lock. + * + * We could check initially without the lock but it is + * very likely that we need to take the lock in every + * iteration. + */ + rq = task_rq_lock(p, &flags); + running = task_running(rq, p); + task_rq_unlock(rq, &flags); + + if (likely(!running)) + break; + /* + * The switch count is incremented before the actual + * context switch. We thus wait for two switches to be + * sure at least one completed. + */ + if ((p->nvcsw - nvcsw) > 1) + break; + if ((p->nivcsw - nivcsw) > 1) + break; + + cpu_relax(); + } +} + /* * wait_task_inactive - wait for a thread to unschedule. * -- cgit v1.2.3 From de79f54f5347ad7ec6ff55ccbb6d4ab2a21f6a93 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:40 +0200 Subject: x86, bts, hw-branch-tracer: add _noirq variants to the debug store interface The hw-branch-tracer uses debug store functions from an on_each_cpu() context, which is simply wrong since the functions may sleep. Add _noirq variants for most functions, which may be called with interrupts disabled. Separate per-cpu and per-task tracing and allow per-cpu tracing to be controlled from any cpu. Make the hw-branch-tracer use the new debug store interface, synchronize with hotplug cpu event using get/put_online_cpus(), and remove the unnecessary spinlock. Make the ptrace bts and the ds selftest code use the new interface. Defer the ds selftest. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144555.658136000@intel.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_hw_branches.c | 193 +++++++++++++++------------------------ 1 file changed, 72 insertions(+), 121 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 8b2109a6c61..50565d8cd2e 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -4,7 +4,6 @@ * Copyright (C) 2008-2009 Intel Corporation. * Markus Metzger , 2008-2009 */ -#include #include #include #include @@ -21,168 +20,113 @@ #define BTS_BUFFER_SIZE (1 << 13) -/* - * The tracer lock protects the below per-cpu tracer array. - * It needs to be held to: - * - start tracing on all cpus - * - stop tracing on all cpus - * - start tracing on a single hotplug cpu - * - stop tracing on a single hotplug cpu - * - read the trace from all cpus - * - read the trace from a single cpu - */ -static DEFINE_SPINLOCK(bts_tracer_lock); static DEFINE_PER_CPU(struct bts_tracer *, tracer); static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); #define this_tracer per_cpu(tracer, smp_processor_id()) -#define this_buffer per_cpu(buffer, smp_processor_id()) static int trace_hw_branches_enabled __read_mostly; static int trace_hw_branches_suspended __read_mostly; static struct trace_array *hw_branch_trace __read_mostly; -/* - * Initialize the tracer for the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. - */ -static void bts_trace_init_cpu(void *arg) +static void bts_trace_init_cpu(int cpu) { - if (this_tracer) - ds_release_bts(this_tracer); + per_cpu(tracer, cpu) = + ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); - this_tracer = ds_request_bts(NULL, this_buffer, BTS_BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - if (IS_ERR(this_tracer)) { - this_tracer = NULL; - return; - } + if (IS_ERR(per_cpu(tracer, cpu))) + per_cpu(tracer, cpu) = NULL; } static int bts_trace_init(struct trace_array *tr) { - int cpu, avail; - - spin_lock(&bts_tracer_lock); + int cpu; hw_branch_trace = tr; + trace_hw_branches_enabled = 0; - on_each_cpu(bts_trace_init_cpu, NULL, 1); - - /* Check on how many cpus we could enable tracing */ - avail = 0; - for_each_online_cpu(cpu) - if (per_cpu(tracer, cpu)) - avail++; + get_online_cpus(); + for_each_online_cpu(cpu) { + bts_trace_init_cpu(cpu); - trace_hw_branches_enabled = (avail ? 1 : 0); + if (likely(per_cpu(tracer, cpu))) + trace_hw_branches_enabled = 1; + } trace_hw_branches_suspended = 0; - - spin_unlock(&bts_tracer_lock); - + put_online_cpus(); /* If we could not enable tracing on a single cpu, we fail. */ - return avail ? 0 : -EOPNOTSUPP; -} - -/* - * Release the tracer for the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. - */ -static void bts_trace_release_cpu(void *arg) -{ - if (this_tracer) { - ds_release_bts(this_tracer); - this_tracer = NULL; - } + return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP; } static void bts_trace_reset(struct trace_array *tr) { - spin_lock(&bts_tracer_lock); + int cpu; - on_each_cpu(bts_trace_release_cpu, NULL, 1); + get_online_cpus(); + for_each_online_cpu(cpu) { + if (likely(per_cpu(tracer, cpu))) { + ds_release_bts(per_cpu(tracer, cpu)); + per_cpu(tracer, cpu) = NULL; + } + } trace_hw_branches_enabled = 0; trace_hw_branches_suspended = 0; - - spin_unlock(&bts_tracer_lock); -} - -/* - * Resume tracing on the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. - */ -static void bts_trace_resume_cpu(void *arg) -{ - if (this_tracer) - ds_resume_bts(this_tracer); + put_online_cpus(); } static void bts_trace_start(struct trace_array *tr) { - spin_lock(&bts_tracer_lock); + int cpu; - on_each_cpu(bts_trace_resume_cpu, NULL, 1); + get_online_cpus(); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_resume_bts(per_cpu(tracer, cpu)); trace_hw_branches_suspended = 0; - - spin_unlock(&bts_tracer_lock); -} - -/* - * Suspend tracing on the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. - */ -static void bts_trace_suspend_cpu(void *arg) -{ - if (this_tracer) - ds_suspend_bts(this_tracer); + put_online_cpus(); } static void bts_trace_stop(struct trace_array *tr) { - spin_lock(&bts_tracer_lock); + int cpu; - on_each_cpu(bts_trace_suspend_cpu, NULL, 1); + get_online_cpus(); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_suspend_bts(per_cpu(tracer, cpu)); trace_hw_branches_suspended = 1; - - spin_unlock(&bts_tracer_lock); + put_online_cpus(); } static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned int cpu = (unsigned long)hcpu; - - spin_lock(&bts_tracer_lock); - - if (!trace_hw_branches_enabled) - goto out; + int cpu = (long)hcpu; switch (action) { case CPU_ONLINE: case CPU_DOWN_FAILED: - smp_call_function_single(cpu, bts_trace_init_cpu, NULL, 1); - - if (trace_hw_branches_suspended) - smp_call_function_single(cpu, bts_trace_suspend_cpu, - NULL, 1); + /* The notification is sent with interrupts enabled. */ + if (trace_hw_branches_enabled) { + bts_trace_init_cpu(cpu); + + if (trace_hw_branches_suspended && + likely(per_cpu(tracer, cpu))) + ds_suspend_bts(per_cpu(tracer, cpu)); + } break; + case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, bts_trace_release_cpu, NULL, 1); - break; + /* The notification is sent with interrupts enabled. */ + if (likely(per_cpu(tracer, cpu))) { + ds_release_bts(per_cpu(tracer, cpu)); + per_cpu(tracer, cpu) = NULL; + } } - out: - spin_unlock(&bts_tracer_lock); return NOTIFY_DONE; } @@ -274,7 +218,7 @@ static void trace_bts_at(const struct bts_trace *trace, void *at) /* * Collect the trace on the current cpu and write it into the ftrace buffer. * - * pre: bts_tracer_lock must be locked + * pre: tracing must be suspended on the current cpu */ static void trace_bts_cpu(void *arg) { @@ -291,10 +235,9 @@ static void trace_bts_cpu(void *arg) if (unlikely(!this_tracer)) return; - ds_suspend_bts(this_tracer); trace = ds_read_bts(this_tracer); if (!trace) - goto out; + return; for (at = trace->ds.top; (void *)at < trace->ds.end; at += trace->ds.size) @@ -303,18 +246,27 @@ static void trace_bts_cpu(void *arg) for (at = trace->ds.begin; (void *)at < trace->ds.top; at += trace->ds.size) trace_bts_at(trace, at); - -out: - ds_resume_bts(this_tracer); } static void trace_bts_prepare(struct trace_iterator *iter) { - spin_lock(&bts_tracer_lock); + int cpu; + get_online_cpus(); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_suspend_bts(per_cpu(tracer, cpu)); + /* + * We need to collect the trace on the respective cpu since ftrace + * implicitly adds the record for the current cpu. + * Once that is more flexible, we could collect the data from any cpu. + */ on_each_cpu(trace_bts_cpu, iter->tr, 1); - spin_unlock(&bts_tracer_lock); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_resume_bts(per_cpu(tracer, cpu)); + put_online_cpus(); } static void trace_bts_close(struct trace_iterator *iter) @@ -324,12 +276,11 @@ static void trace_bts_close(struct trace_iterator *iter) void trace_hw_branch_oops(void) { - spin_lock(&bts_tracer_lock); - - if (trace_hw_branches_enabled) + if (this_tracer) { + ds_suspend_bts_noirq(this_tracer); trace_bts_cpu(hw_branch_trace); - - spin_unlock(&bts_tracer_lock); + ds_resume_bts_noirq(this_tracer); + } } struct tracer bts_tracer __read_mostly = -- cgit v1.2.3 From 4d657e51dfc042216febd4a007c6f36881f9256d Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:41 +0200 Subject: x86, hw-branch-tracer: allocate selftest iterator on heap Allocate the trace_iterator for the hw-branch-tracer selftest on the heap. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144556.578777000@intel.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_selftest.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 499d01c44cd..00dd6485bdd 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -757,7 +757,7 @@ int trace_selftest_startup_hw_branches(struct tracer *trace, struct trace_array *tr) { - struct trace_iterator iter; + struct trace_iterator *iter; struct tracer tracer; unsigned long count; int ret; @@ -777,17 +777,21 @@ trace_selftest_startup_hw_branches(struct tracer *trace, * The hw-branch tracer needs to collect the trace from the various * cpu trace buffers - before tracing is stopped. */ - memset(&iter, 0, sizeof(iter)); + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + memcpy(&tracer, trace, sizeof(tracer)); - iter.trace = &tracer; - iter.tr = tr; - iter.pos = -1; - mutex_init(&iter.mutex); + iter->trace = &tracer; + iter->tr = tr; + iter->pos = -1; + mutex_init(&iter->mutex); - trace->open(&iter); + trace->open(iter); - mutex_destroy(&iter.mutex); + mutex_destroy(&iter->mutex); + kfree(iter); tracing_stop(); -- cgit v1.2.3 From 0f4814065ff8c24ca8bfd75c9b73502be152c287 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:48 +0200 Subject: x86, ptrace: add bts context unconditionally Add the ptrace bts context field to task_struct unconditionally. Initialize the field directly in copy_process(). Remove all the unneeded functionality used to initialize that field. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144603.292754000@intel.com> Signed-off-by: Ingo Molnar --- kernel/fork.c | 4 ++-- kernel/ptrace.c | 10 ---------- 2 files changed, 2 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 660c2b8765b..69bde7a22e9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1086,8 +1086,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif - if (unlikely(current->ptrace)) - ptrace_fork(p, clone_flags); + + p->bts = NULL; /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index aaad0ec3419..321127d965c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -26,16 +26,6 @@ #include -/* - * Initialize a new task whose father had been ptraced. - * - * Called from copy_process(). - */ -void ptrace_fork(struct task_struct *child, unsigned long clone_flags) -{ - arch_ptrace_fork(child, clone_flags); -} - /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. -- cgit v1.2.3 From a5dec5573f3c7e63f2f9b5852b9759ea342a5ff9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 27 Mar 2009 14:55:44 +0800 Subject: tracing: use macros to denote usec and nsec per second Impact: cleanup Use USEC_PER_SEC and NSEC_PER_SEC instead of 1000000 and 1000000000. Signed-off-by: Li Zefan LKML-Reference: <49CC7870.9000309@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_boot.c | 5 +++-- kernel/trace/trace_mmiotrace.c | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 7a30fc4c364..a29ef23ffb4 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "trace.h" #include "trace_output.h" @@ -67,7 +68,7 @@ initcall_call_print_line(struct trace_iterator *iter) trace_assign_type(field, entry); call = &field->boot_call; ts = iter->ts; - nsec_rem = do_div(ts, 1000000000); + nsec_rem = do_div(ts, NSEC_PER_SEC); ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", (unsigned long)ts, nsec_rem, call->func, call->caller); @@ -92,7 +93,7 @@ initcall_ret_print_line(struct trace_iterator *iter) trace_assign_type(field, entry); init_ret = &field->boot_ret; ts = iter->ts; - nsec_rem = do_div(ts, 1000000000); + nsec_rem = do_div(ts, NSEC_PER_SEC); ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " "returned %d after %llu msecs\n", diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 8e37fcddd8b..d53b45ed080 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -9,6 +9,8 @@ #include #include #include +#include + #include #include "trace.h" @@ -174,7 +176,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) struct mmiotrace_rw *rw; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; int ret = 1; @@ -221,7 +223,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter) struct mmiotrace_map *m; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; int ret; -- cgit v1.2.3 From 5452af664f6fba26b80eb2c8c4ceae2999d5cf56 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Mar 2009 00:25:38 +0100 Subject: tracing/ftrace: factorize the tracing files creation Impact: cleanup Most of the tracing files creation follow the same pattern: ret = debugfs_create_file(...) if (!ret) pr_warning("Couldn't create ... entry\n") Unify it! Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker LKML-Reference: <1238109938-11840-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 39 +++------ kernel/trace/ring_buffer.c | 7 +- kernel/trace/trace.c | 159 +++++++++++++++---------------------- kernel/trace/trace.h | 6 ++ kernel/trace/trace_event_profile.c | 1 - kernel/trace/trace_printk.c | 6 +- kernel/trace/trace_stack.c | 13 +-- kernel/trace/trace_sysprof.c | 6 +- 8 files changed, 86 insertions(+), 151 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 678e3d6caf8..6ea5a1ae6a9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2698,38 +2698,23 @@ static const struct file_operations ftrace_graph_fops = { static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { - struct dentry *entry; - entry = debugfs_create_file("available_filter_functions", 0444, - d_tracer, NULL, &ftrace_avail_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'available_filter_functions' entry\n"); + trace_create_file("available_filter_functions", 0444, + d_tracer, NULL, &ftrace_avail_fops); - entry = debugfs_create_file("failures", 0444, - d_tracer, NULL, &ftrace_failures_fops); - if (!entry) - pr_warning("Could not create debugfs 'failures' entry\n"); + trace_create_file("failures", 0444, + d_tracer, NULL, &ftrace_failures_fops); - entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, - NULL, &ftrace_filter_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_filter' entry\n"); + trace_create_file("set_ftrace_filter", 0644, d_tracer, + NULL, &ftrace_filter_fops); - entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer, + trace_create_file("set_ftrace_notrace", 0644, d_tracer, NULL, &ftrace_notrace_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_notrace' entry\n"); #ifdef CONFIG_FUNCTION_GRAPH_TRACER - entry = debugfs_create_file("set_graph_function", 0444, d_tracer, + trace_create_file("set_graph_function", 0444, d_tracer, NULL, &ftrace_graph_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_graph_function' entry\n"); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ return 0; @@ -2987,7 +2972,6 @@ static const struct file_operations ftrace_pid_fops = { static __init int ftrace_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -2995,11 +2979,8 @@ static __init int ftrace_init_debugfs(void) ftrace_init_dyn_debugfs(d_tracer); - entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer, - NULL, &ftrace_pid_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_pid' entry\n"); + trace_create_file("set_ftrace_pid", 0644, d_tracer, + NULL, &ftrace_pid_fops); ftrace_profile_debugfs(d_tracer); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 960cbf44c84..74a11808c28 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2845,14 +2845,11 @@ static const struct file_operations rb_simple_fops = { static __init int rb_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("tracing_on", 0644, d_tracer, - &ring_buffer_flags, &rb_simple_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_on' entry\n"); + trace_create_file("tracing_on", 0644, d_tracer, + &ring_buffer_flags, &rb_simple_fops); return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 32653c8c6e2..0615751a3ed 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3581,7 +3581,7 @@ struct dentry *tracing_dentry_percpu(void) static void tracing_init_debugfs_percpu(long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(); - struct dentry *entry, *d_cpu; + struct dentry *d_cpu; /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ char cpu_dir[7]; @@ -3596,21 +3596,15 @@ static void tracing_init_debugfs_percpu(long cpu) } /* per cpu trace_pipe */ - entry = debugfs_create_file("trace_pipe", 0444, d_cpu, - (void *) cpu, &tracing_pipe_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace_pipe' entry\n"); + trace_create_file("trace_pipe", 0444, d_cpu, + (void *) cpu, &tracing_pipe_fops); /* per cpu trace */ - entry = debugfs_create_file("trace", 0644, d_cpu, - (void *) cpu, &tracing_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); + trace_create_file("trace", 0644, d_cpu, + (void *) cpu, &tracing_fops); - entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu, - (void *) cpu, &tracing_buffers_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n"); + trace_create_file("trace_pipe_raw", 0444, d_cpu, + (void *) cpu, &tracing_buffers_fops); } #ifdef CONFIG_FTRACE_SELFTEST @@ -3766,6 +3760,22 @@ static const struct file_operations trace_options_core_fops = { .write = trace_options_core_write, }; +struct dentry *trace_create_file(const char *name, + mode_t mode, + struct dentry *parent, + void *data, + const struct file_operations *fops) +{ + struct dentry *ret; + + ret = debugfs_create_file(name, mode, parent, data, fops); + if (!ret) + pr_warning("Could not create debugfs '%s' entry\n", name); + + return ret; +} + + static struct dentry *trace_options_init_dentry(void) { struct dentry *d_tracer; @@ -3793,7 +3803,6 @@ create_trace_option_file(struct trace_option_dentry *topt, struct tracer_opt *opt) { struct dentry *t_options; - struct dentry *entry; t_options = trace_options_init_dentry(); if (!t_options) @@ -3802,11 +3811,9 @@ create_trace_option_file(struct trace_option_dentry *topt, topt->flags = flags; topt->opt = opt; - entry = debugfs_create_file(opt->name, 0644, t_options, topt, + topt->entry = trace_create_file(opt->name, 0644, t_options, topt, &trace_options_fops); - topt->entry = entry; - } static struct trace_option_dentry * @@ -3861,123 +3868,81 @@ static struct dentry * create_trace_option_core_file(const char *option, long index) { struct dentry *t_options; - struct dentry *entry; t_options = trace_options_init_dentry(); if (!t_options) return NULL; - entry = debugfs_create_file(option, 0644, t_options, (void *)index, + return trace_create_file(option, 0644, t_options, (void *)index, &trace_options_core_fops); - - return entry; } static __init void create_trace_options_dir(void) { struct dentry *t_options; - struct dentry *entry; int i; t_options = trace_options_init_dentry(); if (!t_options) return; - for (i = 0; trace_options[i]; i++) { - entry = create_trace_option_core_file(trace_options[i], i); - if (!entry) - pr_warning("Could not create debugfs %s entry\n", - trace_options[i]); - } + for (i = 0; trace_options[i]; i++) + create_trace_option_core_file(trace_options[i], i); } static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *entry; int cpu; d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("tracing_enabled", 0644, d_tracer, - &global_trace, &tracing_ctrl_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); + trace_create_file("tracing_enabled", 0644, d_tracer, + &global_trace, &tracing_ctrl_fops); - entry = debugfs_create_file("trace_options", 0644, d_tracer, - NULL, &tracing_iter_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace_options' entry\n"); + trace_create_file("trace_options", 0644, d_tracer, + NULL, &tracing_iter_fops); - create_trace_options_dir(); + trace_create_file("tracing_cpumask", 0644, d_tracer, + NULL, &tracing_cpumask_fops); + + trace_create_file("trace", 0644, d_tracer, + (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); + + trace_create_file("available_tracers", 0444, d_tracer, + &global_trace, &show_traces_fops); + + trace_create_file("current_tracer", 0444, d_tracer, + &global_trace, &set_tracer_fops); + + trace_create_file("tracing_max_latency", 0644, d_tracer, + &tracing_max_latency, &tracing_max_lat_fops); + + trace_create_file("tracing_thresh", 0644, d_tracer, + &tracing_thresh, &tracing_max_lat_fops); - entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, - NULL, &tracing_cpumask_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); - - entry = debugfs_create_file("trace", 0644, d_tracer, - (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); - - entry = debugfs_create_file("available_tracers", 0444, d_tracer, - &global_trace, &show_traces_fops); - if (!entry) - pr_warning("Could not create debugfs 'available_tracers' entry\n"); - - entry = debugfs_create_file("current_tracer", 0444, d_tracer, - &global_trace, &set_tracer_fops); - if (!entry) - pr_warning("Could not create debugfs 'current_tracer' entry\n"); - - entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer, - &tracing_max_latency, - &tracing_max_lat_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'tracing_max_latency' entry\n"); - - entry = debugfs_create_file("tracing_thresh", 0644, d_tracer, - &tracing_thresh, &tracing_max_lat_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'tracing_thresh' entry\n"); - entry = debugfs_create_file("README", 0644, d_tracer, - NULL, &tracing_readme_fops); - if (!entry) - pr_warning("Could not create debugfs 'README' entry\n"); - - entry = debugfs_create_file("trace_pipe", 0444, d_tracer, + trace_create_file("README", 0644, d_tracer, + NULL, &tracing_readme_fops); + + trace_create_file("trace_pipe", 0444, d_tracer, (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'trace_pipe' entry\n"); - - entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer, - &global_trace, &tracing_entries_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'buffer_size_kb' entry\n"); - - entry = debugfs_create_file("trace_marker", 0220, d_tracer, - NULL, &tracing_mark_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'trace_marker' entry\n"); + + trace_create_file("buffer_size_kb", 0644, d_tracer, + &global_trace, &tracing_entries_fops); + + trace_create_file("trace_marker", 0220, d_tracer, + NULL, &tracing_mark_fops); #ifdef CONFIG_DYNAMIC_FTRACE - entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, - &ftrace_update_tot_cnt, - &tracing_dyn_info_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'dyn_ftrace_total_info' entry\n"); + trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, + &ftrace_update_tot_cnt, &tracing_dyn_info_fops); #endif #ifdef CONFIG_SYSPROF_TRACER init_tracer_sysprof_debugfs(d_tracer); #endif + create_trace_options_dir(); + for_each_tracing_cpu(cpu) tracing_init_debugfs_percpu(cpu); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 47aa6d0c97a..f76a8f8689d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -470,6 +470,12 @@ void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); void tracing_reset_online_cpus(struct trace_array *tr); int tracing_open_generic(struct inode *inode, struct file *filp); +struct dentry *trace_create_file(const char *name, + mode_t mode, + struct dentry *parent, + void *data, + const struct file_operations *fops); + struct dentry *tracing_init_dentry(void); void init_tracer_sysprof_debugfs(struct dentry *d_tracer); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 22cba997077..199de9c7422 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -28,4 +28,3 @@ void ftrace_profile_disable(int event_id) return event->profile_disable(event); } } - diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index eb81556107f..9bece9687b6 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -245,17 +245,13 @@ static const struct file_operations ftrace_formats_fops = { static __init int init_trace_printk_function_export(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); if (!d_tracer) return 0; - entry = debugfs_create_file("printk_formats", 0444, d_tracer, + trace_create_file("printk_formats", 0444, d_tracer, NULL, &ftrace_formats_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'printk_formats' entry\n"); return 0; } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c750f65f966..1796f00524e 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -352,19 +352,14 @@ __setup("stacktrace", enable_stacktrace); static __init int stack_trace_init(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("stack_max_size", 0644, d_tracer, - &max_stack_size, &stack_max_size_fops); - if (!entry) - pr_warning("Could not create debugfs 'stack_max_size' entry\n"); + trace_create_file("stack_max_size", 0644, d_tracer, + &max_stack_size, &stack_max_size_fops); - entry = debugfs_create_file("stack_trace", 0444, d_tracer, - NULL, &stack_trace_fops); - if (!entry) - pr_warning("Could not create debugfs 'stack_trace' entry\n"); + trace_create_file("stack_trace", 0444, d_tracer, + NULL, &stack_trace_fops); if (stack_tracer_enabled) register_ftrace_function(&trace_ops); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 91fd19c2149..e04b76cc238 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -321,11 +321,7 @@ static const struct file_operations sysprof_sample_fops = { void init_tracer_sysprof_debugfs(struct dentry *d_tracer) { - struct dentry *entry; - entry = debugfs_create_file("sysprof_sample_period", 0644, + trace_create_file("sysprof_sample_period", 0644, d_tracer, NULL, &sysprof_sample_fops); - if (entry) - return; - pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n"); } -- cgit v1.2.3 From 597af81537654097b67fd7a0c92775e66d4a86fe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 3 Apr 2009 15:24:12 -0400 Subject: function-graph: use int instead of atomic for ftrace_graph_active Impact: cleanup The variable ftrace_graph_active is only modified under the ftrace_lock mutex, thus an atomic is not necessary for modification. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6ea5a1ae6a9..8e6a0b5c994 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3092,7 +3092,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static atomic_t ftrace_graph_active; +static int ftrace_graph_active; static struct notifier_block ftrace_suspend_notifier; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) @@ -3244,7 +3244,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, mutex_lock(&ftrace_lock); /* we currently allow only one tracer registered at a time */ - if (atomic_read(&ftrace_graph_active)) { + if (ftrace_graph_active) { ret = -EBUSY; goto out; } @@ -3252,10 +3252,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; register_pm_notifier(&ftrace_suspend_notifier); - atomic_inc(&ftrace_graph_active); + ftrace_graph_active++; ret = start_graph_tracing(); if (ret) { - atomic_dec(&ftrace_graph_active); + ftrace_graph_active--; goto out; } @@ -3273,10 +3273,10 @@ void unregister_ftrace_graph(void) { mutex_lock(&ftrace_lock); - if (!unlikely(atomic_read(&ftrace_graph_active))) + if (unlikely(!ftrace_graph_active)) goto out; - atomic_dec(&ftrace_graph_active); + ftrace_graph_active--; unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; @@ -3290,7 +3290,7 @@ void unregister_ftrace_graph(void) /* Allocate a return stack for newly created task */ void ftrace_graph_init_task(struct task_struct *t) { - if (atomic_read(&ftrace_graph_active)) { + if (ftrace_graph_active) { t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH * sizeof(struct ftrace_ret_stack), GFP_KERNEL); -- cgit v1.2.3 From dcef788eb9659b61a2110284fcce3ca6e63480d2 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Tue, 31 Mar 2009 15:26:14 +0800 Subject: ftrace: clean up enable logic for sched_switch Unify sched_switch and sched_wakeup's action to following logic: Do record_cmdline when start_cmdline_record() is called. Start tracing events when the tracer is started. Signed-off-by: Zhao Lei LKML-Reference: <49D1C596.5050203@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_switch.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 9117cea6f1a..9d8cccdfaa0 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -29,13 +29,13 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, int cpu; int pc; - if (!sched_ref || sched_stopped) + if (unlikely(!sched_ref)) return; tracing_record_cmdline(prev); tracing_record_cmdline(next); - if (!tracer_enabled) + if (!tracer_enabled || sched_stopped) return; pc = preempt_count(); @@ -56,15 +56,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) unsigned long flags; int cpu, pc; - if (!likely(tracer_enabled)) + if (unlikely(!sched_ref)) return; - pc = preempt_count(); tracing_record_cmdline(current); - if (sched_stopped) + if (!tracer_enabled || sched_stopped) return; + pc = preempt_count(); local_irq_save(flags); cpu = raw_smp_processor_id(); data = ctx_trace->data[cpu]; -- cgit v1.2.3 From bab5bc9e857638880facef76e4b4c3fa807f8c73 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Tue, 7 Apr 2009 23:23:50 -0700 Subject: futex: fixup unlocked requeue pi case Thomas's testing caught a problem when the requeue target futex is unowned and multiple tasks are requeued to it. This patch ensures the FUTEX_WAITERS bit gets set if futex_requeue() will requeue one or more tasks in addition to the one acquiring the lock. Signed-off-by: Darren Hart Signed-off-by: Thomas Gleixner --- kernel/futex.c | 65 +++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 185c981d89e..041bf3ac4be 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -565,12 +565,14 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, /** * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex - * @uaddr: the pi futex user address - * @hb: the pi futex hash bucket - * @key: the futex key associated with uaddr and hb - * @ps: the pi_state pointer where we store the result of the lookup - * @task: the task to perform the atomic lock work for. This will be - * "current" except in the case of requeue pi. + * @uaddr: the pi futex user address + * @hb: the pi futex hash bucket + * @key: the futex key associated with uaddr and hb + * @ps: the pi_state pointer where we store the result of the + * lookup + * @task: the task to perform the atomic lock work for. This will + * be "current" except in the case of requeue pi. + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * * Returns: * 0 - ready to wait @@ -582,7 +584,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, union futex_key *key, struct futex_pi_state **ps, - struct task_struct *task) + struct task_struct *task, int set_waiters) { int lock_taken, ret, ownerdied = 0; u32 uval, newval, curval; @@ -596,6 +598,8 @@ retry: * the locks. It will most likely not succeed. */ newval = task_pid_vnr(task); + if (set_waiters) + newval |= FUTEX_WAITERS; curval = cmpxchg_futex_value_locked(uaddr, 0, newval); @@ -1004,14 +1008,18 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) /** * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter - * @pifutex: the user address of the to futex - * @hb1: the from futex hash bucket, must be locked by the caller - * @hb2: the to futex hash bucket, must be locked by the caller - * @key1: the from futex key - * @key2: the to futex key + * @pifutex: the user address of the to futex + * @hb1: the from futex hash bucket, must be locked by the caller + * @hb2: the to futex hash bucket, must be locked by the caller + * @key1: the from futex key + * @key2: the to futex key + * @ps: address to store the pi_state pointer + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * * Try and get the lock on behalf of the top waiter if we can do it atomically. - * Wake the top waiter if we succeed. hb1 and hb2 must be held by the caller. + * Wake the top waiter if we succeed. If the caller specified set_waiters, + * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. + * hb1 and hb2 must be held by the caller. * * Returns: * 0 - failed to acquire the lock atomicly @@ -1022,15 +1030,23 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2, union futex_key *key1, union futex_key *key2, - struct futex_pi_state **ps) + struct futex_pi_state **ps, int set_waiters) { - struct futex_q *top_waiter; + struct futex_q *top_waiter = NULL; u32 curval; int ret; if (get_futex_value_locked(&curval, pifutex)) return -EFAULT; + /* + * Find the top_waiter and determine if there are additional waiters. + * If the caller intends to requeue more than 1 waiter to pifutex, + * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, + * as we have means to handle the possible fault. If not, don't set + * the bit unecessarily as it will force the subsequent unlock to enter + * the kernel. + */ top_waiter = futex_top_waiter(hb1, key1); /* There are no waiters, nothing for us to do. */ @@ -1038,10 +1054,12 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, return 0; /* - * Either take the lock for top_waiter or set the FUTEX_WAITERS bit. - * The pi_state is returned in ps in contended cases. + * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in + * the contended case or if set_waiters is 1. The pi_state is returned + * in ps in contended cases. */ - ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task); + ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, + set_waiters); if (ret == 1) requeue_pi_wake_futex(top_waiter, key2); @@ -1146,9 +1164,14 @@ retry_private: } if (requeue_pi && (task_count - nr_wake < nr_requeue)) { - /* Attempt to acquire uaddr2 and wake the top_waiter. */ + /* + * Attempt to acquire uaddr2 and wake the top waiter. If we + * intend to requeue waiters, force setting the FUTEX_WAITERS + * bit. We force this here where we are able to easily handle + * faults rather in the requeue loop below. + */ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, - &key2, &pi_state); + &key2, &pi_state, nr_requeue); /* * At this point the top_waiter has either taken uaddr2 or is @@ -1810,7 +1833,7 @@ retry: retry_private: hb = queue_lock(&q); - ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current); + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); if (unlikely(ret)) { switch (ret) { case 1: -- cgit v1.2.3 From e30e08f65c7ef6c230424264f09c3d53f117f58b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:25 +0200 Subject: perf_counter: fix NMI race in task clock We should not be updating ctx->time from NMI context, work around that. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130408.681326666@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 863703b3158..84a39081344 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -319,8 +319,6 @@ static void __perf_counter_disable(void *info) spin_lock_irqsave(&ctx->lock, flags); - update_context_time(ctx); - /* * If the counter is on, turn it off. * If it is in error state, leave it in error state. @@ -2335,13 +2333,11 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { * Software counter: task time clock */ -static void task_clock_perf_counter_update(struct perf_counter *counter) +static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) { - u64 prev, now; + u64 prev; s64 delta; - now = counter->ctx->time; - prev = atomic64_xchg(&counter->hw.prev_count, now); delta = now - prev; atomic64_add(delta, &counter->count); @@ -2369,13 +2365,24 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) static void task_clock_perf_counter_disable(struct perf_counter *counter) { hrtimer_cancel(&counter->hw.hrtimer); - task_clock_perf_counter_update(counter); + task_clock_perf_counter_update(counter, counter->ctx->time); + } static void task_clock_perf_counter_read(struct perf_counter *counter) { - update_context_time(counter->ctx); - task_clock_perf_counter_update(counter); + u64 time; + + if (!in_nmi()) { + update_context_time(counter->ctx); + time = counter->ctx->time; + } else { + u64 now = perf_clock(); + u64 delta = now - counter->ctx->timestamp; + time = counter->ctx->time + delta; + } + + task_clock_perf_counter_update(counter, time); } static const struct hw_perf_counter_ops perf_ops_task_clock = { -- cgit v1.2.3 From 6fab01927e8bdbbc77bafba2abb4810c5591ad52 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:26 +0200 Subject: perf_counter: provide misc bits in the event header Limit the size of each record to 64k (or should we count in multiples of u64 and have a 512K limit?), this gives 16 bits or spare room in the header, which we can use for misc bits, so as to not have to grow the record with u64 every time we have a few bits to report. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130408.769271806@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 84a39081344..4af98f943d3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1831,6 +1831,9 @@ static void perf_counter_output(struct perf_counter *counter, header.type = PERF_EVENT_COUNTER_OVERFLOW; header.size = sizeof(header); + header.misc = user_mode(regs) ? + PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL; + if (record_type & PERF_RECORD_IP) { ip = instruction_pointer(regs); header.type |= __PERF_EVENT_IP; -- cgit v1.2.3 From 6b6e5486b3a168f0328c82a8d4376caf901472b1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:27 +0200 Subject: perf_counter: use misc field to widen type Push the PERF_EVENT_COUNTER_OVERFLOW bit into the misc field so that we can have the full 32bit for PERF_RECORD_ bits. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130408.891867663@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4af98f943d3..bf12df6f353 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1828,15 +1828,16 @@ static void perf_counter_output(struct perf_counter *counter, int callchain_size = 0; u64 time; - header.type = PERF_EVENT_COUNTER_OVERFLOW; + header.type = 0; header.size = sizeof(header); - header.misc = user_mode(regs) ? + header.misc = PERF_EVENT_MISC_OVERFLOW; + header.misc |= user_mode(regs) ? PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL; if (record_type & PERF_RECORD_IP) { ip = instruction_pointer(regs); - header.type |= __PERF_EVENT_IP; + header.type |= PERF_RECORD_IP; header.size += sizeof(ip); } @@ -1845,12 +1846,12 @@ static void perf_counter_output(struct perf_counter *counter, tid_entry.pid = current->group_leader->pid; tid_entry.tid = current->pid; - header.type |= __PERF_EVENT_TID; + header.type |= PERF_RECORD_TID; header.size += sizeof(tid_entry); } if (record_type & PERF_RECORD_GROUP) { - header.type |= __PERF_EVENT_GROUP; + header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + counter->nr_siblings * sizeof(group_entry); } @@ -1861,7 +1862,7 @@ static void perf_counter_output(struct perf_counter *counter, if (callchain) { callchain_size = (1 + callchain->nr) * sizeof(u64); - header.type |= __PERF_EVENT_CALLCHAIN; + header.type |= PERF_RECORD_CALLCHAIN; header.size += callchain_size; } } @@ -1872,7 +1873,7 @@ static void perf_counter_output(struct perf_counter *counter, */ time = sched_clock(); - header.type |= __PERF_EVENT_TIME; + header.type |= PERF_RECORD_TIME; header.size += sizeof(u64); } -- cgit v1.2.3 From 8d1b2d9361b494bfc761700c348c65ebbe3deb5b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:30 +0200 Subject: perf_counter: track task-comm data Similar to the mmap data stream, add one that tracks the task COMM field, so that the userspace reporting knows what to call a task. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.127422406@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index bf12df6f353..2d4aebb2982 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1916,6 +1916,99 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_end(&handle); } +/* + * comm tracking + */ + +struct perf_comm_event { + struct task_struct *task; + char *comm; + int comm_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + } event; +}; + +static void perf_counter_comm_output(struct perf_counter *counter, + struct perf_comm_event *comm_event) +{ + struct perf_output_handle handle; + int size = comm_event->event.header.size; + int ret = perf_output_begin(&handle, counter, size, 0, 0); + + if (ret) + return; + + perf_output_put(&handle, comm_event->event); + perf_output_copy(&handle, comm_event->comm, + comm_event->comm_size); + perf_output_end(&handle); +} + +static int perf_counter_comm_match(struct perf_counter *counter, + struct perf_comm_event *comm_event) +{ + if (counter->hw_event.comm && + comm_event->event.header.type == PERF_EVENT_COMM) + return 1; + + return 0; +} + +static void perf_counter_comm_ctx(struct perf_counter_context *ctx, + struct perf_comm_event *comm_event) +{ + struct perf_counter *counter; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_comm_match(counter, comm_event)) + perf_counter_comm_output(counter, comm_event); + } + rcu_read_unlock(); +} + +static void perf_counter_comm_event(struct perf_comm_event *comm_event) +{ + struct perf_cpu_context *cpuctx; + unsigned int size; + char *comm = comm_event->task->comm; + + size = ALIGN(strlen(comm), sizeof(u64)); + + comm_event->comm = comm; + comm_event->comm_size = size; + + comm_event->event.header.size = sizeof(comm_event->event) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_counter_comm_ctx(&cpuctx->ctx, comm_event); + put_cpu_var(perf_cpu_context); + + perf_counter_comm_ctx(¤t->perf_counter_ctx, comm_event); +} + +void perf_counter_comm(struct task_struct *task) +{ + struct perf_comm_event comm_event = { + .task = task, + .event = { + .header = { .type = PERF_EVENT_COMM, }, + .pid = task->group_leader->pid, + .tid = task->pid, + }, + }; + + perf_counter_comm_event(&comm_event); +} + /* * mmap tracking */ -- cgit v1.2.3 From 4d855457d84b819fefcd1cd1b0a2a0a0ec475c07 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:32 +0200 Subject: perf_counter: move PERF_RECORD_TIME Move PERF_RECORD_TIME so that all the fixed length items come before the variable length ones. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.307926436@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2d4aebb2982..4dc8600d282 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1850,6 +1850,16 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(tid_entry); } + if (record_type & PERF_RECORD_TIME) { + /* + * Maybe do better on x86 and provide cpu_clock_nmi() + */ + time = sched_clock(); + + header.type |= PERF_RECORD_TIME; + header.size += sizeof(u64); + } + if (record_type & PERF_RECORD_GROUP) { header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + @@ -1867,16 +1877,6 @@ static void perf_counter_output(struct perf_counter *counter, } } - if (record_type & PERF_RECORD_TIME) { - /* - * Maybe do better on x86 and provide cpu_clock_nmi() - */ - time = sched_clock(); - - header.type |= PERF_RECORD_TIME; - header.size += sizeof(u64); - } - ret = perf_output_begin(&handle, counter, header.size, nmi, 1); if (ret) return; @@ -1889,6 +1889,9 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_TID) perf_output_put(&handle, tid_entry); + if (record_type & PERF_RECORD_TIME) + perf_output_put(&handle, time); + if (record_type & PERF_RECORD_GROUP) { struct perf_counter *leader, *sub; u64 nr = counter->nr_siblings; @@ -1910,9 +1913,6 @@ static void perf_counter_output(struct perf_counter *counter, if (callchain) perf_output_copy(&handle, callchain, callchain_size); - if (record_type & PERF_RECORD_TIME) - perf_output_put(&handle, time); - perf_output_end(&handle); } -- cgit v1.2.3 From 78f13e9525ba777da25c4ddab89f28e9366a8b7c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:33 +0200 Subject: perf_counter: allow for data addresses to be recorded Paul suggested we allow for data addresses to be recorded along with the traditional IPs as power can provide these. For now, only the software pagefault events provide data addresses, but in the future power might as well for some events. x86 doesn't seem capable of providing this atm. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.394816925@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4dc8600d282..321c57e3556 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -800,7 +800,7 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) update_context_time(ctx); regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs); + perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; @@ -1810,7 +1810,7 @@ static void perf_output_end(struct perf_output_handle *handle) } static void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { int ret; u64 record_type = counter->hw_event.record_type; @@ -1860,6 +1860,11 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } + if (record_type & PERF_RECORD_ADDR) { + header.type |= PERF_RECORD_ADDR; + header.size += sizeof(u64); + } + if (record_type & PERF_RECORD_GROUP) { header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + @@ -1892,6 +1897,9 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_TIME) perf_output_put(&handle, time); + if (record_type & PERF_RECORD_ADDR) + perf_output_put(&handle, addr); + if (record_type & PERF_RECORD_GROUP) { struct perf_counter *leader, *sub; u64 nr = counter->nr_siblings; @@ -2158,7 +2166,7 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, */ int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { int events = atomic_read(&counter->event_limit); int ret = 0; @@ -2175,7 +2183,7 @@ int perf_counter_overflow(struct perf_counter *counter, perf_counter_disable(counter); } - perf_counter_output(counter, nmi, regs); + perf_counter_output(counter, nmi, regs, addr); return ret; } @@ -2240,7 +2248,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) regs = task_pt_regs(current); if (regs) { - if (perf_counter_overflow(counter, 0, regs)) + if (perf_counter_overflow(counter, 0, regs, 0)) ret = HRTIMER_NORESTART; } @@ -2250,11 +2258,11 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) } static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { perf_swcounter_update(counter); perf_swcounter_set_period(counter); - if (perf_counter_overflow(counter, nmi, regs)) + if (perf_counter_overflow(counter, nmi, regs, addr)) /* soft-disable the counter */ ; @@ -2286,16 +2294,17 @@ static int perf_swcounter_match(struct perf_counter *counter, } static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { int neg = atomic64_add_negative(nr, &counter->hw.count); if (counter->hw.irq_period && !neg) - perf_swcounter_overflow(counter, nmi, regs); + perf_swcounter_overflow(counter, nmi, regs, addr); } static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, enum perf_event_types type, u32 event, - u64 nr, int nmi, struct pt_regs *regs) + u64 nr, int nmi, struct pt_regs *regs, + u64 addr) { struct perf_counter *counter; @@ -2305,7 +2314,7 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { if (perf_swcounter_match(counter, type, event, regs)) - perf_swcounter_add(counter, nr, nmi, regs); + perf_swcounter_add(counter, nr, nmi, regs, addr); } rcu_read_unlock(); } @@ -2325,7 +2334,8 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) } static void __perf_swcounter_event(enum perf_event_types type, u32 event, - u64 nr, int nmi, struct pt_regs *regs) + u64 nr, int nmi, struct pt_regs *regs, + u64 addr) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); int *recursion = perf_swcounter_recursion_context(cpuctx); @@ -2336,10 +2346,11 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event, (*recursion)++; barrier(); - perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs); + perf_swcounter_ctx_event(&cpuctx->ctx, type, event, + nr, nmi, regs, addr); if (cpuctx->task_ctx) { perf_swcounter_ctx_event(cpuctx->task_ctx, type, event, - nr, nmi, regs); + nr, nmi, regs, addr); } barrier(); @@ -2349,9 +2360,10 @@ out: put_cpu_var(perf_cpu_context); } -void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) +void +perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { - __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs); + __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr); } static void perf_swcounter_read(struct perf_counter *counter) @@ -2548,7 +2560,7 @@ void perf_tpcounter_event(int event_id) if (!regs) regs = task_pt_regs(current); - __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs); + __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); } extern int ftrace_profile_enable(int); -- cgit v1.2.3 From 002f128b473fb82f454654be5081b0919ee01ab2 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Wed, 8 Apr 2009 15:29:43 -0700 Subject: sched: remove redundant hierarchy walk in check_preempt_wakeup Impact: micro-optimization Under group scheduling we traverse up until we are at common siblings to make the wakeup comparison on. At this point however, they should have the same parent so continuing to check up the tree is redundant. Signed-off-by: Paul Turner Acked-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3816f217f11..5f9650e8fe7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) find_matching_se(&se, &pse); - while (se) { - BUG_ON(!pse); + BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) { - resched_task(curr); - break; - } - - se = parent_entity(se); - pse = parent_entity(pse); - } + if (wakeup_preempt_entity(se, pse) == 1) + resched_task(curr); } static struct task_struct *pick_next_task_fair(struct rq *rq) -- cgit v1.2.3 From 888fcee066a2f4abd0d0bc9418c0535f9b01e6e5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Apr 2009 09:48:22 +0200 Subject: perf_counter: fix off task->comm by one strlen() does not include the \0. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 321c57e3556..b07195bbd22 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1989,7 +1989,7 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) unsigned int size; char *comm = comm_event->task->comm; - size = ALIGN(strlen(comm), sizeof(u64)); + size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; comm_event->comm_size = size; @@ -2109,7 +2109,7 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) } got_name: - size = ALIGN(strlen(name), sizeof(u64)); + size = ALIGN(strlen(name)+1, sizeof(u64)); mmap_event->file_name = name; mmap_event->file_size = size; -- cgit v1.2.3 From 9ee318a7825929bc3734110b83ae8e20e53d9de3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Apr 2009 10:53:44 +0200 Subject: perf_counter: optimize mmap/comm tracking Impact: performance optimization The mmap/comm tracking code does quite a lot of work before it discovers there's no interest in it, avoid that by keeping a counter. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090409085524.427173196@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b07195bbd22..76376ecb23b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -38,6 +38,10 @@ int perf_max_counters __read_mostly = 1; static int perf_reserved_percpu __read_mostly; static int perf_overcommit __read_mostly = 1; +static atomic_t nr_mmap_tracking __read_mostly; +static atomic_t nr_munmap_tracking __read_mostly; +static atomic_t nr_comm_tracking __read_mostly; + /* * Mutex for (sysadmin-configurable) counter reservations: */ @@ -1186,6 +1190,13 @@ static void free_counter(struct perf_counter *counter) { perf_pending_sync(counter); + if (counter->hw_event.mmap) + atomic_dec(&nr_mmap_tracking); + if (counter->hw_event.munmap) + atomic_dec(&nr_munmap_tracking); + if (counter->hw_event.comm) + atomic_dec(&nr_comm_tracking); + if (counter->destroy) counter->destroy(counter); @@ -2005,7 +2016,12 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) void perf_counter_comm(struct task_struct *task) { - struct perf_comm_event comm_event = { + struct perf_comm_event comm_event; + + if (!atomic_read(&nr_comm_tracking)) + return; + + comm_event = (struct perf_comm_event){ .task = task, .event = { .header = { .type = PERF_EVENT_COMM, }, @@ -2128,7 +2144,12 @@ got_name: void perf_counter_mmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file) { - struct perf_mmap_event mmap_event = { + struct perf_mmap_event mmap_event; + + if (!atomic_read(&nr_mmap_tracking)) + return; + + mmap_event = (struct perf_mmap_event){ .file = file, .event = { .header = { .type = PERF_EVENT_MMAP, }, @@ -2146,7 +2167,12 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file) { - struct perf_mmap_event mmap_event = { + struct perf_mmap_event mmap_event; + + if (!atomic_read(&nr_munmap_tracking)) + return; + + mmap_event = (struct perf_mmap_event){ .file = file, .event = { .header = { .type = PERF_EVENT_MUNMAP, }, @@ -2725,6 +2751,13 @@ done: counter->hw_ops = hw_ops; + if (counter->hw_event.mmap) + atomic_inc(&nr_mmap_tracking); + if (counter->hw_event.munmap) + atomic_inc(&nr_munmap_tracking); + if (counter->hw_event.comm) + atomic_inc(&nr_comm_tracking); + return counter; } -- cgit v1.2.3 From 1ccd15497869f3ed83b5225d410df53a96e52757 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Apr 2009 10:53:45 +0200 Subject: perf_counter: sysctl for system wide perf counters Impact: add sysctl for paranoid/relaxed perfcounters policy Allow the use of system wide perf counters to everybody, but provide a sysctl to disable it for the paranoid security minded. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090409085524.514046352@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 4 +++- kernel/sysctl.c | 11 +++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 76376ecb23b..7efb7ebaaae 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -42,6 +42,8 @@ static atomic_t nr_mmap_tracking __read_mostly; static atomic_t nr_munmap_tracking __read_mostly; static atomic_t nr_comm_tracking __read_mostly; +int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ + /* * Mutex for (sysadmin-configurable) counter reservations: */ @@ -1132,7 +1134,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) */ if (cpu != -1) { /* Must be root to operate on a CPU counter: */ - if (!capable(CAP_SYS_ADMIN)) + if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); if (cpu < 0 || cpu > num_possible_cpus()) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4286b62b34a..8ba457838d9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -920,6 +921,16 @@ static struct ctl_table kern_table[] = { .child = slow_work_sysctls, }, #endif +#ifdef CONFIG_PERF_COUNTERS + { + .ctl_name = CTL_UNNUMBERED, + .procname = "perf_counter_privileged", + .data = &sysctl_perf_counter_priv, + .maxlen = sizeof(sysctl_perf_counter_priv), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt -- cgit v1.2.3 From d3d21c412d8525eb2e208d990ab5eee5fb0fe03d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Apr 2009 10:53:46 +0200 Subject: perf_counter: log full path names Impact: fix perf-report output for /home mounted binaries, etc. dentry_path() only provide path-names up to the mount root, which is unsuited for out purpose, use d_path() instead. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090409085524.601794134@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7efb7ebaaae..7f9521c3c01 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2116,7 +2116,7 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) name = strncpy(tmp, "//enomem", sizeof(tmp)); goto got_name; } - name = dentry_path(file->f_dentry, buf, PATH_MAX); + name = d_path(&file->f_path, buf, PATH_MAX); if (IS_ERR(name)) { name = strncpy(tmp, "//toolong", sizeof(tmp)); goto got_name; -- cgit v1.2.3 From 2062501ae6505dbc5bff3a792246c2661d114050 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Apr 2009 01:49:33 +0200 Subject: tracing/lockdep: report the time waited for a lock While trying to optimize the new lock on reiserfs to replace the bkl, I find the lock tracing very useful though it lacks something important for performance (and latency) instrumentation: the time a task waits for a lock. That's what this patch implements: bash-4816 [000] 202.652815: lock_contended: lock_contended: &sb->s_type->i_mutex_key bash-4816 [000] 202.652819: lock_acquired: &rq->lock (0.000 us) <...>-4787 [000] 202.652825: lock_acquired: &rq->lock (0.000 us) <...>-4787 [000] 202.652829: lock_acquired: &rq->lock (0.000 us) bash-4816 [000] 202.652833: lock_acquired: &sb->s_type->i_mutex_key (16.005 us) As shown above, the "lock acquired" field is followed by the time it has been waiting for the lock. Usually, a lock contended entry is followed by a near lock_acquired entry with a non-zero time waited. Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Steven Rostedt LKML-Reference: <1238975373-15739-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b0f01186696..c4582a6ea95 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3061,6 +3061,8 @@ found_it: put_lock_stats(stats); } +DEFINE_TRACE(lock_acquired); + static void __lock_acquired(struct lockdep_map *lock, unsigned long ip) { @@ -3099,6 +3101,8 @@ found_it: hlock->holdtime_stamp = now; } + trace_lock_acquired(lock, ip, waittime); + stats = get_lock_stats(hlock_class(hlock)); if (waittime) { if (hlock->read) @@ -3137,14 +3141,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_contended); -DEFINE_TRACE(lock_acquired); - void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - trace_lock_acquired(lock, ip); - if (unlikely(!lock_stat)) return; -- cgit v1.2.3 From a5a2a0c7fa039c59619bc908b3b1ed24734d442a Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 10 Apr 2009 09:50:05 -0700 Subject: futex: fix futex_wait_setup key handling If the get_futex_key() call were to fail, the existing code would try and put_futex_key() prior to returning. This patch makes sure we only put_futex_key() if get_futex_key() succeeded. Reported-by: Clark Williams Signed-off-by: Darren Hart LKML-Reference: <20090410165005.14342.16973.stgit@Aeon> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 041bf3ac4be..6d2daa46f9f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1668,7 +1668,7 @@ retry: q->key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q->key); if (unlikely(ret != 0)) - goto out; + return ret; retry_private: *hb = queue_lock(q); -- cgit v1.2.3 From 02af61bb50f5d5f0322dbe5ab2a0d75808d25c7b Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 10 Apr 2009 14:26:18 +0800 Subject: tracing, kmemtrace: Separate include/trace/kmemtrace.h to kmemtrace part and tracepoint part Impact: refactor code for future changes Current kmemtrace.h is used both as header file of kmemtrace and kmem's tracepoints definition. Tracepoints' definition file may be used by other code, and should only have definition of tracepoint. We can separate include/trace/kmemtrace.h into 2 files: include/linux/kmemtrace.h: header file for kmemtrace include/trace/kmem.h: definition of kmem tracepoints Signed-off-by: Zhao Lei Acked-by: Eduard - Gabriel Munteanu Acked-by: Pekka Enberg Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <49DEE68A.5040902@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 2 +- kernel/trace/trace.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 5011f4d91e3..7a0aa0e260d 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include "trace_output.h" #include "trace.h" diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f76a8f8689d..34b94c3f40a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include enum trace_type { -- cgit v1.2.3 From b78825d608f30a47e3154ab6872a03f0de0c9d45 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 1 Apr 2009 16:18:53 +0800 Subject: blktrace: fix output of unknown events Not all events are pc (packet command) events. An event is a pc event only if it has BLK_TC_PC bit set. Signed-off-by: Li Zefan Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49D3236D.3090705@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 921ef5d1f0b..e45e1af1356 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1182,7 +1182,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) - ret = trace_seq_printf(s, "Bad pc action %x\n", what); + ret = trace_seq_printf(s, "Unknown action %x\n", what); else { ret = log_action(iter, what2act[what].act[long_act]); if (ret) -- cgit v1.2.3 From 66de7792c02693b49671afe58c771fde3b092fc7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 1 Apr 2009 16:19:19 +0800 Subject: blktrace: fix output of BLK_TC_PC events BLK_TC_PC events should be treated differently with BLK_TC_FS events. Before this patch: # echo 1 > /sys/block/sda/sda1/trace/enable # echo pc > /sys/block/sda/sda1/trace/act_mask # echo blk > /debugfs/tracing/current_tracer # (generate some BLK_TC_PC events) # cat trace bash-2184 [000] 1774.275413: 8,7 I N [bash] bash-2184 [000] 1774.275435: 8,7 D N [bash] bash-2184 [000] 1774.275540: 8,7 I R [bash] bash-2184 [000] 1774.275547: 8,7 D R [bash] ksoftirqd/0-4 [000] 1774.275580: 8,7 C N 0 [0] bash-2184 [000] 1774.275648: 8,7 I R [bash] bash-2184 [000] 1774.275653: 8,7 D R [bash] ksoftirqd/0-4 [000] 1774.275682: 8,7 C N 0 [0] bash-2184 [000] 1774.275739: 8,7 I R [bash] bash-2184 [000] 1774.275744: 8,7 D R [bash] ksoftirqd/0-4 [000] 1774.275771: 8,7 C N 0 [0] bash-2184 [000] 1774.275804: 8,7 I R [bash] bash-2184 [000] 1774.275808: 8,7 D R [bash] ksoftirqd/0-4 [000] 1774.275836: 8,7 C N 0 [0] After this patch: # cat trace bash-2263 [000] 366.782149: 8,7 I N 0 (00 ..) [bash] bash-2263 [000] 366.782323: 8,7 D N 0 (00 ..) [bash] bash-2263 [000] 366.782557: 8,7 I R 8 (25 00 ..) [bash] bash-2263 [000] 366.782560: 8,7 D R 8 (25 00 ..) [bash] ksoftirqd/0-4 [000] 366.782582: 8,7 C N (25 00 ..) [0] bash-2263 [000] 366.782648: 8,7 I R 8 (5a 00 3f 00) [bash] bash-2263 [000] 366.782650: 8,7 D R 8 (5a 00 3f 00) [bash] ksoftirqd/0-4 [000] 366.782669: 8,7 C N (5a 00 3f 00) [0] bash-2263 [000] 366.782710: 8,7 I R 8 (5a 00 08 00) [bash] bash-2263 [000] 366.782713: 8,7 D R 8 (5a 00 08 00) [bash] ksoftirqd/0-4 [000] 366.782730: 8,7 C N (5a 00 08 00) [0] bash-2263 [000] 366.783375: 8,7 I R 36 (5a 00 08 00) [bash] bash-2263 [000] 366.783379: 8,7 D R 36 (5a 00 08 00) [bash] ksoftirqd/0-4 [000] 366.783404: 8,7 C N (5a 00 08 00) [0] This is what we do with PC events in user-space blktrace. Signed-off-by: Li Zefan Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49D32387.9040106@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 88 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 80 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e45e1af1356..2b98195b338 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -971,6 +971,16 @@ static inline const void *pdu_start(const struct trace_entry *ent) return te_blk_io_trace(ent) + 1; } +static inline u32 t_action(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->action; +} + +static inline u32 t_bytes(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->bytes; +} + static inline u32 t_sec(const struct trace_entry *ent) { return te_blk_io_trace(ent)->bytes >> 9; @@ -1031,25 +1041,87 @@ static int blk_log_action(struct trace_iterator *iter, const char *act) MAJOR(t->device), MINOR(t->device), act, rwbs); } +static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) +{ + const char *pdu_buf; + int pdu_len; + int i, end, ret; + + pdu_buf = pdu_start(ent); + pdu_len = te_blk_io_trace(ent)->pdu_len; + + if (!pdu_len) + return 1; + + /* find the last zero that needs to be printed */ + for (end = pdu_len - 1; end >= 0; end--) + if (pdu_buf[end]) + break; + end++; + + if (!trace_seq_putc(s, '(')) + return 0; + + for (i = 0; i < pdu_len; i++) { + + ret = trace_seq_printf(s, "%s%02x", + i == 0 ? "" : " ", pdu_buf[i]); + if (!ret) + return ret; + + /* + * stop when the rest is just zeroes and indicate so + * with a ".." appended + */ + if (i == end && end != pdu_len - 1) + return trace_seq_puts(s, " ..) "); + } + + return trace_seq_puts(s, ") "); +} + static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%s]\n", - t_sector(ent), t_sec(ent), cmd); - return trace_seq_printf(s, "[%s]\n", cmd); + if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { + int ret; + + ret = trace_seq_printf(s, "%u ", t_bytes(ent)); + if (!ret) + return 0; + ret = blk_log_dump_pdu(s, ent); + if (!ret) + return 0; + return trace_seq_printf(s, "[%s]\n", cmd); + } else { + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%s]\n", + t_sector(ent), t_sec(ent), cmd); + return trace_seq_printf(s, "[%s]\n", cmd); + } } static int blk_log_with_error(struct trace_seq *s, const struct trace_entry *ent) { - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), - t_sec(ent), t_error(ent)); - return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); + if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { + int ret; + + ret = blk_log_dump_pdu(s, ent); + if (ret) + return trace_seq_printf(s, "[%d]\n", t_error(ent)); + return 0; + } else { + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%d]\n", + t_sector(ent), + t_sec(ent), t_error(ent)); + return trace_seq_printf(s, "%llu [%d]\n", + t_sector(ent), t_error(ent)); + } } static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) -- cgit v1.2.3 From e1112b4d96859367a93468027c9635e2ac04eb3f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 31 Mar 2009 00:48:49 -0500 Subject: tracing/filters: add run-time field descriptions to TRACE_EVENT_FORMAT events This patch adds run-time field descriptions to all the event formats exported using TRACE_EVENT_FORMAT. It also hooks up all the tracers that use them (i.e. the tracers in the 'ftrace subsystem') so they can also have their output filtered by the event-filtering mechanism. When I was testing this, there were a couple of things that fooled me into thinking the filters weren't working, when actually they were - I'll mention them here so others don't make the same mistakes (and file bug reports. ;-) One is that some of the tracers trace multiple events e.g. the sched_switch tracer uses the context_switch and wakeup events, and if you don't set filters on all of the traced events, the unfiltered output from the events without filters on them can make it look like the filtering as a whole isn't working properly, when actually it is doing what it was asked to do - it just wasn't asked to do the right thing. The other is that for the really high-volume tracers e.g. the function tracer, the volume of filtered events can be so high that it pushes the unfiltered events out of the ring buffer before they can be read so e.g. cat'ing the trace file repeatedly shows either no output, or once in awhile some output but that isn't there the next time you read the trace, which isn't what you normally expect when reading the trace file. If you read from the trace_pipe file though, you can catch them before they disappear. Changes from v1: As suggested by Frederic Weisbecker: - get rid of externs in functions - added unlikely() to filter_check_discard() Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 6 ++++ kernel/trace/trace.c | 25 ++++++++++++++++ kernel/trace/trace.h | 20 +++++++++++++ kernel/trace/trace_branch.c | 3 ++ kernel/trace/trace_event_types.h | 6 ++-- kernel/trace/trace_events.c | 7 +++++ kernel/trace/trace_events_filter.c | 4 +-- kernel/trace/trace_events_stage_2.h | 7 ----- kernel/trace/trace_export.c | 57 +++++++++++++++++++++++++++++++++++-- kernel/trace/trace_hw_branches.c | 2 ++ kernel/trace/trace_power.c | 4 +++ 11 files changed, 127 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 7a0aa0e260d..9419ad10541 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -42,6 +42,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, gfp_t gfp_flags, int node) { + struct ftrace_event_call *call = &event_kmem_alloc; struct trace_array *tr = kmemtrace_array; struct kmemtrace_alloc_entry *entry; struct ring_buffer_event *event; @@ -62,6 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, entry->gfp_flags = gfp_flags; entry->node = node; + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); @@ -71,6 +74,7 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, unsigned long call_site, const void *ptr) { + struct ftrace_event_call *call = &event_kmem_free; struct trace_array *tr = kmemtrace_array; struct kmemtrace_free_entry *entry; struct ring_buffer_event *event; @@ -86,6 +90,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, entry->call_site = call_site; entry->ptr = ptr; + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4865459f609..962e6179994 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -898,6 +898,7 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_function; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -912,6 +913,9 @@ trace_function(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; + + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); } @@ -921,6 +925,7 @@ static int __trace_graph_entry(struct trace_array *tr, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; struct ftrace_graph_ent_entry *entry; @@ -933,6 +938,7 @@ static int __trace_graph_entry(struct trace_array *tr, return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(global_trace.buffer, event); return 1; @@ -943,6 +949,7 @@ static void __trace_graph_return(struct trace_array *tr, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; struct ftrace_graph_ret_entry *entry; @@ -955,6 +962,7 @@ static void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(global_trace.buffer, event); } #endif @@ -973,6 +981,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, int skip, int pc) { #ifdef CONFIG_STACKTRACE + struct ftrace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; @@ -990,6 +999,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace(&trace); + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -1015,6 +1025,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, int pc) { #ifdef CONFIG_STACKTRACE + struct ftrace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; struct stack_trace trace; @@ -1036,6 +1047,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace_user(&trace); + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -1052,6 +1064,7 @@ ftrace_trace_special(void *__tr, unsigned long arg1, unsigned long arg2, unsigned long arg3, int pc) { + struct ftrace_event_call *call = &event_special; struct ring_buffer_event *event; struct trace_array *tr = __tr; struct special_entry *entry; @@ -1064,6 +1077,7 @@ ftrace_trace_special(void *__tr, entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; + filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, pc); } @@ -1080,6 +1094,7 @@ tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *next, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_context_switch; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -1095,6 +1110,9 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_prio = next->prio; entry->next_state = next->state; entry->next_cpu = task_cpu(next); + + filter_check_discard(call, entry, event); + trace_buffer_unlock_commit(tr, event, flags, pc); } @@ -1104,6 +1122,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *curr, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -1120,6 +1139,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 6, pc); ftrace_trace_userstack(tr, flags, pc); @@ -1221,6 +1242,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static u32 trace_buf[TRACE_BUF_SIZE]; + struct ftrace_event_call *call = &event_bprint; struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -1260,6 +1282,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(tr->buffer, event); out_unlock: @@ -1279,6 +1302,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; + struct ftrace_event_call *call = &event_print; struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -1314,6 +1338,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(tr->buffer, event); out_unlock: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 34b94c3f40a..e7737281953 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -866,6 +866,21 @@ extern void filter_free_subsystem_preds(struct event_subsystem *system); extern int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred); +static inline void +filter_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer_event *event) +{ + if (unlikely(call->preds) && !filter_match_preds(call, rec)) + ring_buffer_event_discard(event); +} + +#define __common_field(type, item) \ + ret = trace_define_field(event_call, #type, "common_" #item, \ + offsetof(typeof(field.ent), item), \ + sizeof(field.ent.item)); \ + if (ret) \ + return ret; + void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; @@ -897,4 +912,9 @@ do { \ __trace_printk(ip, fmt, ##args); \ } while (0) +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ + extern struct ftrace_event_call event_##call; +#include "trace_event_types.h" + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index e6e32912ffb..c95c25d838e 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -30,6 +30,7 @@ static struct trace_array *branch_tracer; static void probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) { + struct ftrace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; struct ring_buffer_event *event; struct trace_branch *entry; @@ -73,6 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); out: diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index fd78bee71dd..95b147aac22 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -122,8 +122,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned int, line, line) - TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func) - TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file) + TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, + TRACE_FUNC_SIZE+1, func) + TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, + TRACE_FUNC_SIZE+1, file) TRACE_FIELD(char, correct, correct) ), TP_RAW_FMT("%u:%s:%s (%u)") diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 64ec4d278ff..be9299a53e2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -680,6 +680,7 @@ static struct dentry * event_subsystem_dir(const char *name, struct dentry *d_events) { struct event_subsystem *system; + struct dentry *entry; /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { @@ -708,6 +709,12 @@ event_subsystem_dir(const char *name, struct dentry *d_events) system->preds = NULL; + entry = debugfs_create_file("filter", 0644, system->entry, system, + &ftrace_subsystem_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/filter' entry\n", name); + return system->entry; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 026be412f35..470ad9487ec 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -185,7 +185,7 @@ void filter_free_subsystem_preds(struct event_subsystem *system) } events_for_each(call) { - if (!call->name || !call->regfunc) + if (!call->define_fields) continue; if (!strcmp(call->system, system->name)) @@ -324,7 +324,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system, events_for_each(call) { int err; - if (!call->name || !call->regfunc) + if (!call->define_fields) continue; if (strcmp(call->system, system->name)) diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 30743f7d411..1c94b87c718 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -146,13 +146,6 @@ ftrace_format_##call(struct trace_seq *s) \ if (ret) \ return ret; -#define __common_field(type, item) \ - ret = trace_define_field(event_call, #type, "common_" #item, \ - offsetof(typeof(field.ent), item), \ - sizeof(field.ent.item)); \ - if (ret) \ - return ret; - #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ int \ diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 07a22c33ebf..f4e46616c48 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -30,7 +30,7 @@ #undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ +#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ "offset:%u;\tsize:%u;\n", \ (unsigned int)offsetof(typeof(field), item), \ @@ -85,18 +85,69 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_ENTRY entry #undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ +#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ cmd; #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +int ftrace_define_fields_##call(void); \ +static int ftrace_raw_init_event_##call(void); \ \ -static struct ftrace_event_call __used \ +struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .id = proto, \ .system = __stringify(TRACE_SYSTEM), \ + .raw_init = ftrace_raw_init_event_##call, \ .show_format = ftrace_format_##call, \ + .define_fields = ftrace_define_fields_##call, \ +}; \ +static int ftrace_raw_init_event_##call(void) \ +{ \ + INIT_LIST_HEAD(&event_##call.fields); \ + return 0; \ +} \ + +#include "trace_event_types.h" + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#undef TRACE_FIELD_ZERO_CHAR +#define TRACE_FIELD_ZERO_CHAR(item) + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +int \ +ftrace_define_fields_##call(void) \ +{ \ + struct ftrace_event_call *event_call = &event_##call; \ + struct args field; \ + int ret; \ + \ + __common_field(unsigned char, type); \ + __common_field(unsigned char, flags); \ + __common_field(unsigned char, preempt_count); \ + __common_field(int, pid); \ + __common_field(int, tgid); \ + \ + tstruct; \ + \ + return ret; \ } + #include "trace_event_types.h" diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 7bfdf4c2347..e6b275b22ac 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -168,6 +168,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) void trace_hw_branch(u64 from, u64 to) { + struct ftrace_event_call *call = &event_hw_branch; struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; struct hw_branch_entry *entry; @@ -194,6 +195,7 @@ void trace_hw_branch(u64 from, u64 to) entry->ent.type = TRACE_HW_BRANCHES; entry->from = from; entry->to = to; + filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, 0); out: diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index bae791ebcc5..8ce7d7d62c0 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -36,6 +36,7 @@ static void probe_power_start(struct power_trace *it, unsigned int type, static void probe_power_end(struct power_trace *it) { + struct ftrace_event_call *call = &event_power; struct ring_buffer_event *event; struct trace_power *entry; struct trace_array_cpu *data; @@ -54,6 +55,7 @@ static void probe_power_end(struct power_trace *it) goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; + filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); @@ -62,6 +64,7 @@ static void probe_power_end(struct power_trace *it) static void probe_power_mark(struct power_trace *it, unsigned int type, unsigned int level) { + struct ftrace_event_call *call = &event_power; struct ring_buffer_event *event; struct trace_power *entry; struct trace_array_cpu *data; @@ -84,6 +87,7 @@ static void probe_power_mark(struct power_trace *it, unsigned int type, goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; + filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); -- cgit v1.2.3 From e45f2e2bd298e1ff687448e5fd15a3588b5807ec Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 31 Mar 2009 00:49:16 -0500 Subject: tracing/filters: add TRACE_EVENT_FORMAT_NOFILTER event macro Frederic Weisbecker suggested that the trace_special event shouldn't be filterable; this patch adds a TRACE_EVENT_FORMAT_NOFILTER event macro that allows an event format to be exported without having a filter attached, and removes filtering from the trace_special event. Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 -- kernel/trace/trace.h | 2 ++ kernel/trace/trace_event_types.h | 2 +- kernel/trace/trace_export.c | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 36 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 962e6179994..c209d214169 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1064,7 +1064,6 @@ ftrace_trace_special(void *__tr, unsigned long arg1, unsigned long arg2, unsigned long arg3, int pc) { - struct ftrace_event_call *call = &event_special; struct ring_buffer_event *event; struct trace_array *tr = __tr; struct special_entry *entry; @@ -1077,7 +1076,6 @@ ftrace_trace_special(void *__tr, entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; - filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, pc); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e7737281953..3cf856fa597 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -915,6 +915,8 @@ do { \ #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ extern struct ftrace_event_call event_##call; +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) #include "trace_event_types.h" #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 95b147aac22..cfcecc4fd86 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -57,7 +57,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") ); -TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, +TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, arg1, arg1) TRACE_FIELD(unsigned long, arg2, arg2) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index f4e46616c48..77c494f5e1d 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -65,6 +65,22 @@ ftrace_format_##call(struct trace_seq *s) \ return ret; \ } +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ + tpfmt) \ +static int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct args field; \ + int ret; \ + \ + tstruct; \ + \ + trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ + \ + return ret; \ +} + #include "trace_event_types.h" #undef TRACE_ZERO_CHAR @@ -109,6 +125,19 @@ static int ftrace_raw_init_event_##call(void) \ return 0; \ } \ +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ + tpfmt) \ + \ +struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .id = proto, \ + .system = __stringify(TRACE_SYSTEM), \ + .show_format = ftrace_format_##call, \ +}; + #include "trace_event_types.h" #undef TRACE_FIELD @@ -150,4 +179,8 @@ ftrace_define_fields_##call(void) \ return ret; \ } +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ + tpfmt) + #include "trace_event_types.h" -- cgit v1.2.3 From fa1b47dd85453ec7d4bcfe4aa4a2d172ba452fc3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 2 Apr 2009 00:09:41 -0400 Subject: ring-buffer: add ring_buffer_discard_commit The ring_buffer_discard_commit is similar to ring_buffer_event_discard but it can only be done on an event that has yet to be commited. Unpredictable results can happen otherwise. The main difference between ring_buffer_discard_commit and ring_buffer_event_discard is that ring_buffer_discard_commit will try to free the data in the ring buffer if nothing has addded data after the reserved event. If something did, then it acts almost the same as ring_buffer_event_discard followed by a ring_buffer_unlock_commit. Note, either ring_buffer_commit_discard and ring_buffer_unlock_commit can be called on an event, not both. This commit also exports both discard functions to be usable by GPL modules. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 125 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 104 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 74a11808c28..f935bd5ec3e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -205,27 +205,6 @@ static void rb_event_set_padding(struct ring_buffer_event *event) event->time_delta = 0; } -/** - * ring_buffer_event_discard - discard an event in the ring buffer - * @buffer: the ring buffer - * @event: the event to discard - * - * Sometimes a event that is in the ring buffer needs to be ignored. - * This function lets the user discard an event in the ring buffer - * and then that event will not be read later. - * - * Note, it is up to the user to be careful with this, and protect - * against races. If the user discards an event that has been consumed - * it is possible that it could corrupt the ring buffer. - */ -void ring_buffer_event_discard(struct ring_buffer_event *event) -{ - event->type = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; -} - static unsigned rb_event_data_length(struct ring_buffer_event *event) { @@ -1570,6 +1549,110 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); +/** + * ring_buffer_event_discard - discard any event in the ring buffer + * @event: the event to discard + * + * Sometimes a event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * Note, it is up to the user to be careful with this, and protect + * against races. If the user discards an event that has been consumed + * it is possible that it could corrupt the ring buffer. + */ +void ring_buffer_event_discard(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} +EXPORT_SYMBOL_GPL(ring_buffer_event_discard); + +/** + * ring_buffer_commit_discard - discard an event that has not been committed + * @buffer: the ring buffer + * @event: non committed event to discard + * + * This is similar to ring_buffer_event_discard but must only be + * performed on an event that has not been committed yet. The difference + * is that this will also try to free the event from the ring buffer + * if another event has not been added behind it. + * + * If another event has been added behind it, it will set the event + * up as discarded, and perform the commit. + * + * If this function is called, do not call ring_buffer_unlock_commit on + * the event. + */ +void ring_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long new_index, old_index; + struct buffer_page *bpage; + unsigned long index; + unsigned long addr; + int cpu; + + /* The event is discarded regardless */ + ring_buffer_event_discard(event); + + /* + * This must only be called if the event has not been + * committed yet. Thus we can assume that preemption + * is still disabled. + */ + RB_WARN_ON(buffer, !preempt_count()); + + cpu = smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + + new_index = rb_event_index(event); + old_index = new_index + rb_event_length(event); + addr = (unsigned long)event; + addr &= PAGE_MASK; + + bpage = cpu_buffer->tail_page; + + if (bpage == (void *)addr && rb_page_write(bpage) == old_index) { + /* + * This is on the tail page. It is possible that + * a write could come in and move the tail page + * and write to the next page. That is fine + * because we just shorten what is on this page. + */ + index = local_cmpxchg(&bpage->write, old_index, new_index); + if (index == old_index) + goto out; + } + + /* + * The commit is still visible by the reader, so we + * must increment entries. + */ + cpu_buffer->entries++; + out: + /* + * If a write came in and pushed the tail page + * we still need to update the commit pointer + * if we were the commit. + */ + if (rb_is_commit(cpu_buffer, event)) + rb_set_commit_to_write(cpu_buffer); + + /* + * Only the last preempt count needs to restore preemption. + */ + if (preempt_count() == 1) + ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); + else + preempt_enable_no_resched_notrace(); + +} +EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); + /** * ring_buffer_write - write data to the buffer without reserving * @buffer: The ring buffer to write to. -- cgit v1.2.3 From 77d9f465d46fd67cdb82ee5e1ab99dd57a17c486 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 2 Apr 2009 01:16:59 -0400 Subject: tracing/filters: use ring_buffer_discard_commit for discarded events The ring_buffer_discard_commit makes better usage of the ring_buffer when an event has been discarded. It tries to remove it completely if possible. This patch converts the trace event filtering to use ring_buffer_discard_commit instead of the ring_buffer_event_discard. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 9 +++++++-- kernel/trace/trace.h | 1 + kernel/trace/trace_events_stage_3.h | 6 +++--- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c209d214169..d880ab2772c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -884,13 +884,18 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { - return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); + __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); } void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { - return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); + __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); +} + +void trace_current_buffer_discard_commit(struct ring_buffer_event *event) +{ + ring_buffer_discard_commit(global_trace.buffer, event); } void diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3cf856fa597..dfefffd7ae3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -497,6 +497,7 @@ void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc); void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc); +void trace_current_buffer_discard_commit(struct ring_buffer_event *event); struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 9d2fa78cecc..d2f34bf30e5 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -223,9 +223,9 @@ static void ftrace_raw_event_##call(proto) \ assign; \ \ if (call->preds && !filter_match_preds(call, entry)) \ - ring_buffer_event_discard(event); \ - \ - trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ + trace_current_buffer_discard_commit(event); \ + else \ + trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ \ } \ \ -- cgit v1.2.3 From 5f77a88b3f8268b11940b51d2e03d26a663ceb90 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 8 Apr 2009 03:14:01 -0500 Subject: tracing/infrastructure: separate event tracer from event support Add a new config option, CONFIG_EVENT_TRACING that gets selected when CONFIG_TRACING is selected and adds everything needed by the stuff in trace_export - basically all the event tracing support needed by e.g. bprint, minus the actual events, which are only included if CONFIG_EVENT_TRACER is selected. So CONFIG_EVENT_TRACER can be used to turn on or off the generated events (what I think of as the 'event tracer'), while CONFIG_EVENT_TRACING turns on or off the base event tracing support used by both the event tracer and the other things such as bprint that can't be configured out. Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: fweisbec@gmail.com LKML-Reference: <1239178441.10295.34.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 4 ++++ kernel/trace/Makefile | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 23b96ebbf89..644606e899f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -48,6 +48,9 @@ config FTRACE_NMI_ENTER depends on HAVE_FTRACE_NMI_ENTER default y +config EVENT_TRACING + bool + config TRACING bool select DEBUG_FS @@ -56,6 +59,7 @@ config TRACING select TRACEPOINTS select NOP_TRACER select BINARY_PRINTF + select EVENT_TRACING # # Minimum requirements an architecture has to meet for us to diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2630f5121ec..3ad367e7c97 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -40,11 +40,11 @@ obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o -obj-$(CONFIG_EVENT_TRACER) += trace_events.o +obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACER) += events.o -obj-$(CONFIG_EVENT_TRACER) += trace_export.o +obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o -obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o +obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o libftrace-y := ftrace.o -- cgit v1.2.3 From eb02ce017dd83985041a7e54c6449f92d53b026f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 8 Apr 2009 03:15:54 -0500 Subject: tracing/filters: use ring_buffer_discard_commit() in filter_check_discard() This patch changes filter_check_discard() to make use of the new ring_buffer_discard_commit() function and modifies the current users to call the old commit function in the non-discard case. It also introduces a version of filter_check_discard() that uses the global trace buffer (filter_current_check_discard()) for those cases. v2 changes: - fix compile error noticed by Ingo Molnar Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: fweisbec@gmail.com LKML-Reference: <1239178554.10295.36.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 10 ++++----- kernel/trace/trace.c | 45 ++++++++++++++++++++----------------- kernel/trace/trace.h | 14 +++++++++--- kernel/trace/trace_branch.c | 5 ++--- kernel/trace/trace_events_stage_3.h | 5 +---- kernel/trace/trace_hw_branches.c | 4 ++-- kernel/trace/trace_power.c | 8 +++---- 7 files changed, 48 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 9419ad10541..86cdf671d7e 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -63,9 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, entry->gfp_flags = gfp_flags; entry->node = node; - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } @@ -90,9 +89,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, entry->call_site = call_site; entry->ptr = ptr; - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d880ab2772c..c0047fcf707 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -171,6 +171,12 @@ static struct trace_array global_trace; static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +int filter_current_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer_event *event) +{ + return filter_check_discard(call, rec, global_trace.buffer, event); +} + cycle_t ftrace_now(int cpu) { u64 ts; @@ -919,9 +925,8 @@ trace_function(struct trace_array *tr, entry->ip = ip; entry->parent_ip = parent_ip; - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -943,8 +948,8 @@ static int __trace_graph_entry(struct trace_array *tr, return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(global_trace.buffer, event); + if (!filter_current_check_discard(call, entry, event)) + ring_buffer_unlock_commit(global_trace.buffer, event); return 1; } @@ -967,8 +972,8 @@ static void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(global_trace.buffer, event); + if (!filter_current_check_discard(call, entry, event)) + ring_buffer_unlock_commit(global_trace.buffer, event); } #endif @@ -1004,8 +1009,8 @@ static void __ftrace_trace_stack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace(&trace); - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -1052,8 +1057,8 @@ static void ftrace_trace_userstack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace_user(&trace); - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -1114,9 +1119,8 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_state = next->state; entry->next_cpu = task_cpu(next); - filter_check_discard(call, entry, event); - - trace_buffer_unlock_commit(tr, event, flags, pc); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, flags, pc); } void @@ -1142,9 +1146,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 6, pc); ftrace_trace_userstack(tr, flags, pc); } @@ -1285,8 +1288,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); out_unlock: __raw_spin_unlock(&trace_buf_lock); @@ -1341,8 +1344,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); out_unlock: __raw_spin_unlock(&trace_buf_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dfefffd7ae3..9729d14767d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -866,13 +866,21 @@ extern int filter_match_preds(struct ftrace_event_call *call, void *rec); extern void filter_free_subsystem_preds(struct event_subsystem *system); extern int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred); +extern int filter_current_check_discard(struct ftrace_event_call *call, + void *rec, + struct ring_buffer_event *event); -static inline void +static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->preds) && !filter_match_preds(call, rec)) - ring_buffer_event_discard(event); + if (unlikely(call->preds) && !filter_match_preds(call, rec)) { + ring_buffer_discard_commit(buffer, event); + return 1; + } + + return 0; } #define __common_field(type, item) \ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index c95c25d838e..8e64e604f5a 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -74,9 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index d2f34bf30e5..b2b298269eb 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -222,11 +222,8 @@ static void ftrace_raw_event_##call(proto) \ \ assign; \ \ - if (call->preds && !filter_match_preds(call, entry)) \ - trace_current_buffer_discard_commit(event); \ - else \ + if (!filter_current_check_discard(call, entry, event)) \ trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ - \ } \ \ static int ftrace_raw_reg_event_##call(void) \ diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index e6b275b22ac..8683d50a753 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -195,8 +195,8 @@ void trace_hw_branch(u64 from, u64 to) entry->ent.type = TRACE_HW_BRANCHES; entry->from = from; entry->to = to; - filter_check_discard(call, entry, event); - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, 0, 0); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index 8ce7d7d62c0..810a5b7cf1c 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -55,8 +55,8 @@ static void probe_power_end(struct power_trace *it) goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; - filter_check_discard(call, entry, event); - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } @@ -87,8 +87,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type, goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; - filter_check_discard(call, entry, event); - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } -- cgit v1.2.3 From 0a19e53c1514ad8e9c3cbab40c6c3f52c86f403d Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 13 Apr 2009 03:17:50 -0500 Subject: tracing/filters: allow on-the-fly filter switching This patch allows event filters to be safely removed or switched on-the-fly while avoiding the use of rcu or the suspension of tracing of previous versions. It does it by adding a new filter_pred_none() predicate function which does nothing and by never deallocating either the predicates or any of the filter_pred members used in matching; the predicate lists are allocated and initialized during ftrace_event_calls initialization. Whenever a filter is removed or replaced, the filter_pred_* functions currently in use by the affected ftrace_event_call are immediately switched over to to the filter_pred_none() function, while the rest of the filter_pred members are left intact, allowing any currently executing filter_pred_* functions to finish up, using the values they're currently using. In the case of filter replacement, the new predicate values are copied into the old predicates after the above step, and the filter_pred_none() functions are replaced by the filter_pred_* functions for the new filter. In this case, it is possible though very unlikely that a previous filter_pred_* is still running even after the filter_pred_none() switch and the switch to the new filter_pred_*. In that case, however, because nothing has been deallocated in the filter_pred, the worst that can happen is that the old filter_pred_* function sees the new values and as a result produces either a false positive or a false negative, depending on the values it finds. So one downside to this method is that rarely, it can produce a bad match during the filter switch, but it should be possible to live with that, IMHO. The other downside is that at least in this patch the predicate lists are always pre-allocated, taking up memory from the start. They could probably be allocated on first-use, and de-allocated when tracing is completely stopped - if this patch makes sense, I could create another one to do that later on. Oh, and it also places a restriction on the size of __arrays in events, currently set to 128, since they can't be larger than the now embedded str_val arrays in the filter_pred struct. Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker Cc: Steven Rostedt Cc: paulmck@linux.vnet.ibm.com LKML-Reference: <1239610670.6660.49.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 14 +- kernel/trace/trace_events.c | 9 +- kernel/trace/trace_events_filter.c | 252 +++++++++++++++++++----------------- kernel/trace/trace_events_stage_2.h | 1 + kernel/trace/trace_events_stage_3.h | 1 + kernel/trace/trace_export.c | 1 + 6 files changed, 150 insertions(+), 128 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9729d14767d..b05b6ac982a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -813,6 +813,7 @@ struct ftrace_event_call { int (*show_format)(struct trace_seq *s); int (*define_fields)(void); struct list_head fields; + int n_preds; struct filter_pred **preds; #ifdef CONFIG_EVENT_PROFILE @@ -826,6 +827,7 @@ struct event_subsystem { struct list_head list; const char *name; struct dentry *entry; + int n_preds; struct filter_pred **preds; }; @@ -834,7 +836,8 @@ struct event_subsystem { (unsigned long)event < (unsigned long)__stop_ftrace_events; \ event++) -#define MAX_FILTER_PRED 8 +#define MAX_FILTER_PRED 8 +#define MAX_FILTER_STR_VAL 128 struct filter_pred; @@ -843,7 +846,7 @@ typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); struct filter_pred { filter_pred_fn_t fn; u64 val; - char *str_val; + char str_val[MAX_FILTER_STR_VAL]; int str_len; char *field_name; int offset; @@ -855,13 +858,14 @@ struct filter_pred { int trace_define_field(struct ftrace_event_call *call, char *type, char *name, int offset, int size); +extern int init_preds(struct ftrace_event_call *call); extern void filter_free_pred(struct filter_pred *pred); -extern void filter_print_preds(struct filter_pred **preds, +extern void filter_print_preds(struct filter_pred **preds, int n_preds, struct trace_seq *s); extern int filter_parse(char **pbuf, struct filter_pred *pred); extern int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred); -extern void filter_free_preds(struct ftrace_event_call *call); +extern void filter_disable_preds(struct ftrace_event_call *call); extern int filter_match_preds(struct ftrace_event_call *call, void *rec); extern void filter_free_subsystem_preds(struct event_subsystem *system); extern int filter_add_subsystem_pred(struct event_subsystem *system, @@ -875,7 +879,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->preds) && !filter_match_preds(call, rec)) { + if (unlikely(call->n_preds) && !filter_match_preds(call, rec)) { ring_buffer_discard_commit(buffer, event); return 1; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 789e14eb09a..ead68ac9919 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -481,7 +481,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(call->preds, s); + filter_print_preds(call->preds, call->n_preds, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -516,7 +516,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, } if (pred->clear) { - filter_free_preds(call); + filter_disable_preds(call); filter_free_pred(pred); return cnt; } @@ -527,6 +527,8 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return err; } + filter_free_pred(pred); + *ppos += cnt; return cnt; @@ -549,7 +551,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(system->preds, s); + filter_print_preds(system->preds, system->n_preds, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -712,6 +714,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) list_add(&system->list, &event_subsystems); system->preds = NULL; + system->n_preds = 0; entry = debugfs_create_file("filter", 0644, system->entry, system, &ftrace_subsystem_filter_fops); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9f8ecca34a5..de42dad42a8 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -82,25 +82,27 @@ static int filter_pred_string(struct filter_pred *pred, void *event) return match; } +static int filter_pred_none(struct filter_pred *pred, void *event) +{ + return 0; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct ftrace_event_call *call, void *rec) { int i, matched, and_failed = 0; struct filter_pred *pred; - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (call->preds[i]) { - pred = call->preds[i]; - if (and_failed && !pred->or) - continue; - matched = pred->fn(pred, rec); - if (!matched && !pred->or) { - and_failed = 1; - continue; - } else if (matched && pred->or) - return 1; - } else - break; + for (i = 0; i < call->n_preds; i++) { + pred = call->preds[i]; + if (and_failed && !pred->or) + continue; + matched = pred->fn(pred, rec); + if (!matched && !pred->or) { + and_failed = 1; + continue; + } else if (matched && pred->or) + return 1; } if (and_failed) @@ -109,31 +111,29 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec) return 1; } -void filter_print_preds(struct filter_pred **preds, struct trace_seq *s) +void filter_print_preds(struct filter_pred **preds, int n_preds, + struct trace_seq *s) { char *field_name; struct filter_pred *pred; int i; - if (!preds) { + if (!n_preds) { trace_seq_printf(s, "none\n"); return; } - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (preds[i]) { - pred = preds[i]; - field_name = pred->field_name; - if (i) - trace_seq_printf(s, pred->or ? "|| " : "&& "); - trace_seq_printf(s, "%s ", field_name); - trace_seq_printf(s, pred->not ? "!= " : "== "); - if (pred->str_val) - trace_seq_printf(s, "%s\n", pred->str_val); - else - trace_seq_printf(s, "%llu\n", pred->val); - } else - break; + for (i = 0; i < n_preds; i++) { + pred = preds[i]; + field_name = pred->field_name; + if (i) + trace_seq_printf(s, pred->or ? "|| " : "&& "); + trace_seq_printf(s, "%s ", field_name); + trace_seq_printf(s, pred->not ? "!= " : "== "); + if (pred->str_len) + trace_seq_printf(s, "%s\n", pred->str_val); + else + trace_seq_printf(s, "%llu\n", pred->val); } } @@ -156,20 +156,69 @@ void filter_free_pred(struct filter_pred *pred) return; kfree(pred->field_name); - kfree(pred->str_val); kfree(pred); } -void filter_free_preds(struct ftrace_event_call *call) +static void filter_clear_pred(struct filter_pred *pred) +{ + kfree(pred->field_name); + pred->field_name = NULL; + pred->str_len = 0; +} + +static int filter_set_pred(struct filter_pred *dest, + struct filter_pred *src, + filter_pred_fn_t fn) +{ + *dest = *src; + dest->field_name = kstrdup(src->field_name, GFP_KERNEL); + if (!dest->field_name) + return -ENOMEM; + dest->fn = fn; + + return 0; +} + +void filter_disable_preds(struct ftrace_event_call *call) { int i; - if (call->preds) { - for (i = 0; i < MAX_FILTER_PRED; i++) + call->n_preds = 0; + + for (i = 0; i < MAX_FILTER_PRED; i++) + call->preds[i]->fn = filter_pred_none; +} + +int init_preds(struct ftrace_event_call *call) +{ + struct filter_pred *pred; + int i; + + call->n_preds = 0; + + call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); + if (!call->preds) + return -ENOMEM; + + for (i = 0; i < MAX_FILTER_PRED; i++) { + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + goto oom; + pred->fn = filter_pred_none; + call->preds[i] = pred; + } + + return 0; + +oom: + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (call->preds[i]) filter_free_pred(call->preds[i]); - kfree(call->preds); - call->preds = NULL; } + kfree(call->preds); + call->preds = NULL; + + return -ENOMEM; } void filter_free_subsystem_preds(struct event_subsystem *system) @@ -177,11 +226,12 @@ void filter_free_subsystem_preds(struct event_subsystem *system) struct ftrace_event_call *call = __start_ftrace_events; int i; - if (system->preds) { - for (i = 0; i < MAX_FILTER_PRED; i++) + if (system->n_preds) { + for (i = 0; i < system->n_preds; i++) filter_free_pred(system->preds[i]); kfree(system->preds); system->preds = NULL; + system->n_preds = 0; } events_for_each(call) { @@ -189,33 +239,31 @@ void filter_free_subsystem_preds(struct event_subsystem *system) continue; if (!strcmp(call->system, system->name)) - filter_free_preds(call); + filter_disable_preds(call); } } static int __filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred) + struct filter_pred *pred, + filter_pred_fn_t fn) { - int i; + int idx, err; - if (call->preds && !pred->compound) - filter_free_preds(call); + if (call->n_preds && !pred->compound) + filter_disable_preds(call); - if (!call->preds) { - call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), - GFP_KERNEL); - if (!call->preds) - return -ENOMEM; - } + if (call->n_preds == MAX_FILTER_PRED) + return -ENOSPC; - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (!call->preds[i]) { - call->preds[i] = pred; - return 0; - } - } + idx = call->n_preds; + filter_clear_pred(call->preds[idx]); + err = filter_set_pred(call->preds[idx], pred, fn); + if (err) + return err; + + call->n_preds++; - return -ENOSPC; + return 0; } static int is_string_field(const char *type) @@ -229,98 +277,66 @@ static int is_string_field(const char *type) int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) { struct ftrace_event_field *field; + filter_pred_fn_t fn; field = find_event_field(call, pred->field_name); if (!field) return -EINVAL; + pred->fn = filter_pred_none; pred->offset = field->offset; if (is_string_field(field->type)) { - if (!pred->str_val) + if (!pred->str_len) return -EINVAL; - pred->fn = filter_pred_string; + fn = filter_pred_string; pred->str_len = field->size; - return __filter_add_pred(call, pred); + return __filter_add_pred(call, pred, fn); } else { - if (pred->str_val) + if (pred->str_len) return -EINVAL; } switch (field->size) { case 8: - pred->fn = filter_pred_64; + fn = filter_pred_64; break; case 4: - pred->fn = filter_pred_32; + fn = filter_pred_32; break; case 2: - pred->fn = filter_pred_16; + fn = filter_pred_16; break; case 1: - pred->fn = filter_pred_8; + fn = filter_pred_8; break; default: return -EINVAL; } - return __filter_add_pred(call, pred); -} - -static struct filter_pred *copy_pred(struct filter_pred *pred) -{ - struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL); - if (!new_pred) - return NULL; - - memcpy(new_pred, pred, sizeof(*pred)); - - if (pred->field_name) { - new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); - if (!new_pred->field_name) { - kfree(new_pred); - return NULL; - } - } - - if (pred->str_val) { - new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL); - if (!new_pred->str_val) { - filter_free_pred(new_pred); - return NULL; - } - } - - return new_pred; + return __filter_add_pred(call, pred, fn); } int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred) { struct ftrace_event_call *call = __start_ftrace_events; - struct filter_pred *event_pred; - int i; - if (system->preds && !pred->compound) + if (system->n_preds && !pred->compound) filter_free_subsystem_preds(system); - if (!system->preds) { + if (!system->n_preds) { system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); if (!system->preds) return -ENOMEM; } - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (!system->preds[i]) { - system->preds[i] = pred; - break; - } - } - - if (i == MAX_FILTER_PRED) + if (system->n_preds == MAX_FILTER_PRED) return -ENOSPC; + system->preds[system->n_preds] = pred; + events_for_each(call) { int err; @@ -333,22 +349,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system, if (!find_event_field(call, pred->field_name)) continue; - event_pred = copy_pred(pred); - if (!event_pred) - goto oom; - - err = filter_add_pred(call, event_pred); - if (err) - filter_free_pred(event_pred); - if (err == -ENOMEM) - goto oom; + err = filter_add_pred(call, pred); + if (err == -ENOMEM) { + system->preds[system->n_preds] = NULL; + return err; + } } - return 0; + system->n_preds++; -oom: - system->preds[i] = NULL; - return -ENOMEM; + return 0; } int filter_parse(char **pbuf, struct filter_pred *pred) @@ -410,7 +420,8 @@ int filter_parse(char **pbuf, struct filter_pred *pred) } } - if (!val_str) { + if (!val_str || !strlen(val_str) + || strlen(val_str) >= MAX_FILTER_STR_VAL) { pred->field_name = NULL; return -EINVAL; } @@ -419,11 +430,12 @@ int filter_parse(char **pbuf, struct filter_pred *pred) if (!pred->field_name) return -ENOMEM; + pred->str_len = 0; pred->val = simple_strtoull(val_str, &tmp, 0); if (tmp == val_str) { - pred->str_val = kstrdup(val_str, GFP_KERNEL); - if (!pred->str_val) - return -ENOMEM; + strncpy(pred->str_val, val_str, MAX_FILTER_STR_VAL); + pred->str_len = strlen(val_str); + pred->str_val[pred->str_len] = '\0'; } else if (*tmp != '\0') return -EINVAL; diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 02fb710193e..59cfd7dfe68 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -140,6 +140,7 @@ ftrace_format_##call(struct trace_seq *s) \ #undef __array #define __array(type, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ sizeof(field.item)); \ diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index b2b298269eb..5bb1b7ffbdb 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -255,6 +255,7 @@ static int ftrace_raw_init_event_##call(void) \ return -ENODEV; \ event_##call.id = id; \ INIT_LIST_HEAD(&event_##call.fields); \ + init_preds(&event_##call); \ return 0; \ } \ \ diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 77c494f5e1d..48fc02fe73a 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -122,6 +122,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ static int ftrace_raw_init_event_##call(void) \ { \ INIT_LIST_HEAD(&event_##call.fields); \ + init_preds(&event_##call); \ return 0; \ } \ -- cgit v1.2.3 From 7ba5c840e64d4a967379f1ae3eca73278180b11d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Apr 2009 21:31:17 -0700 Subject: rcu: Add __rcu_pending tracing to hierarchical RCU Add tracing to __rcu_pending() to provide information on why RCU processing was kicked off. This is helpful for debugging hierarchical RCU, and might also be helpful in learning how hierarchical RCU operates. Located-by: Anton Blanchard Tested-by: Anton Blanchard Signed-off-by: Paul E. McKenney Cc: anton@samba.org Cc: akpm@linux-foundation.org Cc: dipankar@in.ibm.com Cc: manfred@colorfullife.com Cc: cl@linux-foundation.org Cc: josht@linux.vnet.ibm.com Cc: schamp@sgi.com Cc: niv@us.ibm.com Cc: dvhltc@us.ibm.com Cc: ego@in.ibm.com Cc: laijs@cn.fujitsu.com Cc: rostedt@goodmis.org Cc: peterz@infradead.org Cc: penberg@cs.helsinki.fi Cc: andi@firstfloor.org Cc: "Paul E. McKenney" LKML-Reference: <1239683479943-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 25 +++++++++++++++----- kernel/rcutree_trace.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 82 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d2a372fb0b9..0dccfbba6d2 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1259,31 +1259,44 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) check_cpu_stall(rsp, rdp); /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rdp->qs_pending) + if (rdp->qs_pending) { + rdp->n_rp_qs_pending++; return 1; + } /* Does this CPU have callbacks ready to invoke? */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) + if (cpu_has_callbacks_ready_to_invoke(rdp)) { + rdp->n_rp_cb_ready++; return 1; + } /* Has RCU gone idle with this CPU needing another grace period? */ - if (cpu_needs_another_gp(rsp, rdp)) + if (cpu_needs_another_gp(rsp, rdp)) { + rdp->n_rp_cpu_needs_gp++; return 1; + } /* Has another RCU grace period completed? */ - if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */ + if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */ + rdp->n_rp_gp_completed++; return 1; + } /* Has a new RCU grace period started? */ - if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */ + if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */ + rdp->n_rp_gp_started++; return 1; + } /* Has an RCU GP gone long enough to send resched IPIs &c? */ if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && - ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) + ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { + rdp->n_rp_need_fqs++; return 1; + } /* nothing to do */ + rdp->n_rp_need_nothing++; return 0; } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 4b1875ba940..fe1dcdbf1ca 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -213,7 +213,63 @@ static struct file_operations rcugp_fops = { .release = single_release, }; -static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir; +static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) +{ + seq_printf(m, "%3d%cnp=%ld " + "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", + rdp->cpu, + cpu_is_offline(rdp->cpu) ? '!' : ' ', + rdp->n_rcu_pending, + rdp->n_rp_qs_pending, + rdp->n_rp_cb_ready, + rdp->n_rp_cpu_needs_gp, + rdp->n_rp_gp_completed, + rdp->n_rp_gp_started, + rdp->n_rp_need_fqs, + rdp->n_rp_need_nothing); +} + +static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) +{ + int cpu; + struct rcu_data *rdp; + + for_each_possible_cpu(cpu) { + rdp = rsp->rda[cpu]; + if (rdp->beenonline) + print_one_rcu_pending(m, rdp); + } +} + +static int show_rcu_pending(struct seq_file *m, void *unused) +{ + seq_puts(m, "rcu:\n"); + print_rcu_pendings(m, &rcu_state); + seq_puts(m, "rcu_bh:\n"); + print_rcu_pendings(m, &rcu_bh_state); + return 0; +} + +static int rcu_pending_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcu_pending, NULL); +} + +static struct file_operations rcu_pending_fops = { + .owner = THIS_MODULE, + .open = rcu_pending_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *rcudir; +static struct dentry *datadir; +static struct dentry *datadir_csv; +static struct dentry *gpdir; +static struct dentry *hierdir; +static struct dentry *rcu_pendingdir; + static int __init rcuclassic_trace_init(void) { rcudir = debugfs_create_dir("rcu", NULL); @@ -238,6 +294,11 @@ static int __init rcuclassic_trace_init(void) NULL, &rcuhier_fops); if (!hierdir) goto free_out; + + rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir, + NULL, &rcu_pending_fops); + if (!rcu_pendingdir) + goto free_out; return 0; free_out: if (datadir) @@ -257,6 +318,7 @@ static void __exit rcuclassic_trace_cleanup(void) debugfs_remove(datadir_csv); debugfs_remove(gpdir); debugfs_remove(hierdir); + debugfs_remove(rcu_pendingdir); debugfs_remove(rcudir); } -- cgit v1.2.3 From f711f6090a81cbd396b63de90f415d33f563af9b Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Tue, 14 Apr 2009 10:25:30 +0530 Subject: sched: Nominate idle load balancer from a semi-idle package. Currently the nomination of idle-load balancer is done by choosing the first idle cpu in the nohz.cpu_mask. This may not be power-efficient, since such an idle cpu could come from a completely idle core/package thereby preventing the whole core/package from being in a low-power state. For eg, consider a quad-core dual package system. The cpu numbering need not be sequential and can something like [0, 2, 4, 6] and [1, 3, 5, 7]. With sched_mc/smt_power_savings and the power-aware IRQ balance, we try to keep as fewer Packages/Cores active. But the current idle load balancer logic goes against this by choosing the first_cpu in the nohz.cpu_mask and not taking the system topology into consideration. Improve the algorithm to nominate the idle load balancer from a semi idle cores/packages thereby increasing the probability of the cores/packages being in deeper sleep states for longer duration. The algorithm is activated only when sched_mc/smt_power_savings != 0. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra LKML-Reference: <20090414045530.7645.12175.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 127 +++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 118 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5724508c3b6..b0fefa300b4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4240,10 +4240,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) static struct { atomic_t load_balancer; cpumask_var_t cpu_mask; + cpumask_var_t ilb_grp_nohz_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), }; +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * lowest_flag_domain - Return lowest sched_domain containing flag. + * @cpu: The cpu whose lowest level of sched domain is to + * be returned. + * @flag: The flag to check for the lowest sched_domain + * for the given cpu. + * + * Returns the lowest sched_domain of a cpu which contains the given flag. + */ +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) + if (sd && (sd->flags & flag)) + break; + + return sd; +} + +/** + * for_each_flag_domain - Iterates over sched_domains containing the flag. + * @cpu: The cpu whose domains we're iterating over. + * @sd: variable holding the value of the power_savings_sd + * for cpu. + * @flag: The flag to filter the sched_domains to be iterated. + * + * Iterates over all the scheduler domains for a given cpu that has the 'flag' + * set, starting from the lowest sched_domain to the highest. + */ +#define for_each_flag_domain(cpu, sd, flag) \ + for (sd = lowest_flag_domain(cpu, flag); \ + (sd && (sd->flags & flag)); sd = sd->parent) + +/** + * is_semi_idle_group - Checks if the given sched_group is semi-idle. + * @ilb_group: group to be checked for semi-idleness + * + * Returns: 1 if the group is semi-idle. 0 otherwise. + * + * We define a sched_group to be semi idle if it has atleast one idle-CPU + * and atleast one non-idle CPU. This helper function checks if the given + * sched_group is semi-idle or not. + */ +static inline int is_semi_idle_group(struct sched_group *ilb_group) +{ + cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, + sched_group_cpus(ilb_group)); + + /* + * A sched_group is semi-idle when it has atleast one busy cpu + * and atleast one idle cpu. + */ + if (cpumask_empty(nohz.ilb_grp_nohz_mask)) + return 0; + + if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) + return 0; + + return 1; +} +/** + * find_new_ilb - Finds the optimum idle load balancer for nomination. + * @cpu: The cpu which is nominating a new idle_load_balancer. + * + * Returns: Returns the id of the idle load balancer if it exists, + * Else, returns >= nr_cpu_ids. + * + * This algorithm picks the idle load balancer such that it belongs to a + * semi-idle powersavings sched_domain. The idea is to try and avoid + * completely idle packages/cores just for the purpose of idle load balancing + * when there are other idle cpu's which are better suited for that job. + */ +static int find_new_ilb(int cpu) +{ + struct sched_domain *sd; + struct sched_group *ilb_group; + + /* + * Have idle load balancer selection from semi-idle packages only + * when power-aware load balancing is enabled + */ + if (!(sched_smt_power_savings || sched_mc_power_savings)) + goto out_done; + + /* + * Optimize for the case when we have no idle CPUs or only one + * idle CPU. Don't walk the sched_domain hierarchy in such cases + */ + if (cpumask_weight(nohz.cpu_mask) < 2) + goto out_done; + + for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { + ilb_group = sd->groups; + + do { + if (is_semi_idle_group(ilb_group)) + return cpumask_first(nohz.ilb_grp_nohz_mask); + + ilb_group = ilb_group->next; + + } while (ilb_group != sd->groups); + } + +out_done: + return cpumask_first(nohz.cpu_mask); +} +#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ +static inline int find_new_ilb(int call_cpu) +{ + return first_cpu(nohz.cpu_mask); +} +#endif + /* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle @@ -4468,15 +4584,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) } if (atomic_read(&nohz.load_balancer) == -1) { - /* - * simple selection for now: Nominate the - * first cpu in the nohz list to be the next - * ilb owner. - * - * TBD: Traverse the sched domains and nominate - * the nearest cpu in the nohz.cpu_mask. - */ - int ilb = cpumask_first(nohz.cpu_mask); + int ilb = find_new_ilb(cpu); if (ilb < nr_cpu_ids) resched_cpu(ilb); @@ -9051,6 +9159,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ alloc_bootmem_cpumask_var(&nohz.cpu_mask); + alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask); #endif alloc_bootmem_cpumask_var(&cpu_isolated_map); #endif /* SMP */ -- cgit v1.2.3 From e790fb0ba64bfec158e1219d899cb588275d12ab Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Tue, 14 Apr 2009 10:25:35 +0530 Subject: sched: Nominate a power-efficient ilb in select_nohz_balancer() The CPU that first goes idle becomes the idle-load-balancer and remains that until either it picks up a task or till all the CPUs of the system goes idle. Optimize this further to allow it to relinquish it's post once all it's siblings in the power-aware sched_domain go idle, thereby allowing the whole package-core to go idle. While relinquising the post, nominate another an idle-load balancer from a semi-idle core/package. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra LKML-Reference: <20090414045535.7645.31641.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b0fefa300b4..36d213bca47 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4414,8 +4414,24 @@ int select_nohz_load_balancer(int stop_tick) /* make me the ilb owner */ if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) + } else if (atomic_read(&nohz.load_balancer) == cpu) { + int new_ilb; + + if (!(sched_smt_power_savings || + sched_mc_power_savings)) + return 1; + /* + * Check to see if there is a more power-efficient + * ilb. + */ + new_ilb = find_new_ilb(cpu); + if (new_ilb < nr_cpu_ids && new_ilb != cpu) { + atomic_set(&nohz.load_balancer, -1); + resched_cpu(new_ilb); + return 0; + } return 1; + } } else { if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; -- cgit v1.2.3 From ea20d9293ce423a39717ed4375393129a2e701f9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 08:54:16 -0400 Subject: tracing: consolidate trace and trace_event headers Impact: clean up Neil Horman (et. al.) criticized the way the trace events were broken up into two files. The reason for that was that ftrace needed to separate out the declarations from where the #include was used. It then dawned on me that the tracepoint.h header only needs to define the TRACE_EVENT macro if it is not already defined. The solution is simply to test if TRACE_EVENT is defined, and if it is not then the linux/tracepoint.h header can define it. This change consolidates all the .h and _event_types.h into the .h file. Reported-by: Neil Horman Reported-by: Theodore Tso Reported-by: Jiaying Zhang Cc: Zhaolei Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Jason Baron Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- kernel/trace/events.c | 1 + kernel/trace/trace_events_stage_1.h | 4 ++-- kernel/trace/trace_events_stage_2.h | 8 ++++---- kernel/trace/trace_events_stage_3.h | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/events.c b/kernel/trace/events.c index 246f2aa6dc4..5a35a914f0e 100644 --- a/kernel/trace/events.c +++ b/kernel/trace/events.c @@ -8,6 +8,7 @@ #include "trace_output.h" +#define TRACE_HEADER_MULTI_READ #include "trace_events_stage_1.h" #include "trace_events_stage_2.h" #include "trace_events_stage_3.h" diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index 38985f9b379..475f46a047a 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -1,7 +1,7 @@ /* * Stage 1 of the trace events. * - * Override the macros in to include the following: + * Override the macros in to include the following: * * struct ftrace_raw_ { * struct trace_entry ent; @@ -36,4 +36,4 @@ }; \ static struct ftrace_event_call event_##name -#include +#include diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 59cfd7dfe68..aa4a67a0656 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -1,7 +1,7 @@ /* * Stage 2 of the trace events. * - * Override the macros in to include the following: + * Override the macros in to include the following: * * enum print_line_t * ftrace_raw_output_(struct trace_iterator *iter, int flags) @@ -64,7 +64,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ return TRACE_TYPE_HANDLED; \ } -#include +#include /* * Setup the showing format of trace point. @@ -128,7 +128,7 @@ ftrace_format_##call(struct trace_seq *s) \ return ret; \ } -#include +#include #undef __field #define __field(type, item) \ @@ -167,4 +167,4 @@ ftrace_define_fields_##call(void) \ return ret; \ } -#include +#include diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 5bb1b7ffbdb..45c04e1f38d 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -1,7 +1,7 @@ /* * Stage 3 of the trace events. * - * Override the macros in to include the following: + * Override the macros in to include the following: * * static void ftrace_event_(proto) * { @@ -272,7 +272,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ _TRACE_PROFILE_INIT(call) \ } -#include +#include #undef _TRACE_PROFILE #undef _TRACE_PROFILE_INIT -- cgit v1.2.3 From 78ddb08feb7d4fbe3c0a9931804c51ee58be4023 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 14 Apr 2009 16:53:05 +0200 Subject: wait: don't use __wake_up_common() '777c6c5 wait: prevent exclusive waiter starvation' made __wake_up_common() global to be used from abort_exclusive_wait(). It was needed to do a wake-up with the waitqueue lock held while passing down a key to the wake-up function. Since '4ede816 epoll keyed wakeups: add __wake_up_locked_key() and __wake_up_sync_key()' there is an appropriate wrapper for this case: __wake_up_locked_key(). Use it here and make __wake_up_common() private to the scheduler again. Signed-off-by: Johannes Weiner Cc: Andrew Morton Cc: Peter Zijlstra LKML-Reference: <1239720785-19661-1-git-send-email-hannes@cmpxchg.org> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/wait.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 36d213bca47..92b4b56ad09 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5345,7 +5345,7 @@ EXPORT_SYMBOL(default_wake_function); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ -void __wake_up_common(wait_queue_head_t *q, unsigned int mode, +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key) { wait_queue_t *curr, *next; diff --git a/kernel/wait.c b/kernel/wait.c index 42a2dbc181c..ea7c3b4275c 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_common(q, mode, 1, 0, key); + __wake_up_locked_key(q, mode, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); -- cgit v1.2.3 From 56449f437add737a1e5e1cb7e00f63ac8ead1938 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 14 Apr 2009 11:24:36 +0200 Subject: tracing: make the trace clocks available generally Jeremy Fitzhardinge reported this build failure: LD .tmp_vmlinux1 arch/x86/kernel/built-in.o: In function `ds_take_timestamp': git/linux/arch/x86/kernel/ds.c:1380: undefined reference to `trace_clock_global' git/linux/arch/x86/kernel/ds.c:1380: undefined reference to `trace_clock_global' Which is due to !CONFIG_TRACING && CONFIG_X86_DS=y. Expose the trace clock code to CONFIG_X86_DS as well. [ Unfortunately librarizing doesnt work well - ancient architectures with no raw_local_irq_save() primitive break the build. ] Reported-by: Jeremy Fitzhardinge LKML-Reference: <49E4413F.7070700@goop.org> Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 + kernel/trace/Makefile | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index bab1dffe37e..c8e1be5f0b0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -92,6 +92,7 @@ obj-$(CONFIG_LATENCYTOP) += latencytop.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ +obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2630f5121ec..ecc671e9f14 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -15,11 +15,16 @@ ifdef CONFIG_TRACING_BRANCHES KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING endif +# +# Make the trace clocks available generally: it's infrastructure +# relied on by ptrace for example: +# +obj-y += trace_clock.o + obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o -obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o -- cgit v1.2.3 From a8d154b009168337494fbf345671bab74d3e4b8b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 09:36:00 -0400 Subject: tracing: create automated trace defines This patch lowers the number of places a developer must modify to add new tracepoints. The current method to add a new tracepoint into an existing system is to write the trace point macro in the trace header with one of the macros TRACE_EVENT, TRACE_FORMAT or DECLARE_TRACE, then they must add the same named item into the C file with the macro DEFINE_TRACE(name) and then add the trace point. This change cuts out the needing to add the DEFINE_TRACE(name). Every file that uses the tracepoint must still include the trace/.h file, but the one C file must also add a define before the including of that file. #define CREATE_TRACE_POINTS #include This will cause the trace/mytrace.h file to also produce the C code necessary to implement the trace point. Note, if more than one trace/.h is used to create the C code it is best to list them all together. #define CREATE_TRACE_POINTS #include #include #include Thanks to Mathieu Desnoyers and Christoph Hellwig for coming up with the cleaner solution of the define above the includes over my first design to have the C code include a "special" header. This patch converts sched, irq and lockdep and skb to use this new method. Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neil Horman Cc: Zhao Lei Cc: Eduard - Gabriel Munteanu Cc: Pekka Enberg Signed-off-by: Steven Rostedt --- kernel/exit.c | 4 ---- kernel/fork.c | 2 -- kernel/irq/handle.c | 7 +++---- kernel/kthread.c | 3 --- kernel/lockdep.c | 12 +++--------- kernel/sched.c | 10 +++------- kernel/signal.c | 2 -- kernel/softirq.c | 3 --- 8 files changed, 9 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index abf9cf3b95c..2fe9d2c7eee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -56,10 +56,6 @@ #include #include "cred-internals.h" -DEFINE_TRACE(sched_process_free); -DEFINE_TRACE(sched_process_exit); -DEFINE_TRACE(sched_process_wait); - static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p) diff --git a/kernel/fork.c b/kernel/fork.c index b9e2edd0072..4bebf263923 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -83,8 +83,6 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ -DEFINE_TRACE(sched_process_fork); - int nr_processes(void) { int cpu; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d82142be8dd..983d8be8dff 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -17,9 +17,11 @@ #include #include #include -#include #include +#define CREATE_TRACE_POINTS +#include + #include "internals.h" /* @@ -348,9 +350,6 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } -DEFINE_TRACE(irq_handler_entry); -DEFINE_TRACE(irq_handler_exit); - /** * handle_IRQ_event - irq action chain handler * @irq: the interrupt number diff --git a/kernel/kthread.c b/kernel/kthread.c index 4ebaf8519ab..e1c76924545 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -21,9 +21,6 @@ static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; -DEFINE_TRACE(sched_kthread_stop); -DEFINE_TRACE(sched_kthread_stop_ret); - struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c4582a6ea95..257f21a76c5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -42,12 +42,14 @@ #include #include #include -#include #include #include "lockdep_internals.h" +#define CREATE_TRACE_POINTS +#include + #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; module_param(prove_locking, int, 0644); @@ -2929,8 +2931,6 @@ void lock_set_class(struct lockdep_map *lock, const char *name, } EXPORT_SYMBOL_GPL(lock_set_class); -DEFINE_TRACE(lock_acquire); - /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -2957,8 +2957,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, } EXPORT_SYMBOL_GPL(lock_acquire); -DEFINE_TRACE(lock_release); - void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { @@ -3061,8 +3059,6 @@ found_it: put_lock_stats(stats); } -DEFINE_TRACE(lock_acquired); - static void __lock_acquired(struct lockdep_map *lock, unsigned long ip) { @@ -3118,8 +3114,6 @@ found_it: lock->ip = ip; } -DEFINE_TRACE(lock_contended); - void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; diff --git a/kernel/sched.c b/kernel/sched.c index 5724508c3b6..e6d4518d47e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -72,13 +72,15 @@ #include #include #include -#include #include #include #include "sched_cpupri.h" +#define CREATE_TRACE_POINTS +#include + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -118,12 +120,6 @@ */ #define RUNTIME_INF ((u64)~0ULL) -DEFINE_TRACE(sched_wait_task); -DEFINE_TRACE(sched_wakeup); -DEFINE_TRACE(sched_wakeup_new); -DEFINE_TRACE(sched_switch); -DEFINE_TRACE(sched_migrate_task); - #ifdef CONFIG_SMP static void double_rq_lock(struct rq *rq1, struct rq *rq2); diff --git a/kernel/signal.c b/kernel/signal.c index d8034737db4..1d5703ff003 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -41,8 +41,6 @@ static struct kmem_cache *sigqueue_cachep; -DEFINE_TRACE(sched_signal_send); - static void __user *sig_handler(struct task_struct *t, int sig) { return t->sighand->action[sig - 1].sa.sa_handler; diff --git a/kernel/softirq.c b/kernel/softirq.c index 2fecefacdc5..a2d9b458ac2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -186,9 +186,6 @@ EXPORT_SYMBOL(local_bh_enable_ip); */ #define MAX_SOFTIRQ_RESTART 10 -DEFINE_TRACE(softirq_entry); -DEFINE_TRACE(softirq_exit); - asmlinkage void __do_softirq(void) { struct softirq_action *h; -- cgit v1.2.3 From 9504504cbab29ecb694186b1c5b15d3579c43c51 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 11 Apr 2009 12:59:57 -0400 Subject: tracing: make trace_seq operations available for core kernel In the process to make TRACE_EVENT macro work for modules, the trace_seq operations must be available for core kernel code. These operations are quite useful and can be used for other implementations. The main idea is that we create a trace_seq handle that acts very much like the seq_file handle. struct trace_seq *s = kmalloc(sizeof(*s, GFP_KERNEL); trace_seq_init(s); trace_seq_printf(s, "some data %d\n", variable); printk("%s", s->buffer); The main use is to allow a top level function call several other functions that may store printf like data into the buffer. Then at the end, the top level function can process all the data with any method it would like to. It could be passed to userspace, output via printk or even use seq_file: trace_seq_to_user(s, ubuf, cnt); seq_puts(m, s->buffer); Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 15 ++------------- kernel/trace/trace_output.h | 16 +--------------- 2 files changed, 3 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b05b6ac982a..1882846b738 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -12,6 +12,8 @@ #include #include +#include + enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -423,19 +425,6 @@ struct tracer { struct tracer_stat *stats; }; -struct trace_seq { - unsigned char buffer[PAGE_SIZE]; - unsigned int len; - unsigned int readpos; -}; - -static inline void -trace_seq_init(struct trace_seq *s) -{ - s->len = 0; - s->readpos = 0; -} - #define TRACE_PIPE_ALL_CPU -1 diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 91630217fb4..5c7cbfb65c7 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -1,6 +1,7 @@ #ifndef __TRACE_EVENTS_H #define __TRACE_EVENTS_H +#include #include "trace.h" typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, @@ -20,24 +21,9 @@ trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter); -extern void trace_print_seq(struct seq_file *m, struct trace_seq *s); - -extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); -extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, - size_t cnt); -extern int trace_seq_puts(struct trace_seq *s, const char *str); -extern int trace_seq_putc(struct trace_seq *s, unsigned char c); -extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len); -extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, - size_t len); -extern void *trace_seq_reserve(struct trace_seq *s, size_t len); -extern int trace_seq_path(struct trace_seq *s, struct path *path); extern int seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, unsigned long sym_flags); extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, -- cgit v1.2.3 From 97f2025153499faa17267a0d4e18c7afaf73f39d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 13 Apr 2009 11:20:49 -0400 Subject: tracing/events: move declarations from trace directory to core include In preparation to allowing trace events to happen in modules, we need to move some of the local declarations in the kernel/trace directory into include/linux. This patch simply moves the declarations and performs no context changes. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 120 +------------------------------------------- kernel/trace/trace_output.h | 14 ------ 2 files changed, 1 insertion(+), 133 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1882846b738..6bcdf4af9b2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -13,6 +13,7 @@ #include #include +#include enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -43,20 +44,6 @@ enum trace_type { __TRACE_LAST_TYPE, }; -/* - * The trace entry - the most basic unit of tracing. This is what - * is printed in the end as a single line in the trace output, such as: - * - * bash-15816 [01] 235.197585: idle_cpu <- irq_enter - */ -struct trace_entry { - unsigned char type; - unsigned char flags; - unsigned char preempt_count; - int pid; - int tgid; -}; - /* * Function trace entry - function address and parent function addres: */ @@ -265,8 +252,6 @@ struct trace_array_cpu { char comm[TASK_COMM_LEN]; }; -struct trace_iterator; - /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -341,15 +326,6 @@ extern void __ftrace_bad_type(void); __ftrace_bad_type(); \ } while (0) -/* Return values for print_line callback */ -enum print_line_t { - TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ - TRACE_TYPE_HANDLED = 1, - TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ - TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ -}; - - /* * An option specific to a tracer. This is a boolean value. * The bit is the bit index that sets its value on the @@ -428,31 +404,6 @@ struct tracer { #define TRACE_PIPE_ALL_CPU -1 -/* - * Trace iterator - used by printout routines who present trace - * results to users and which routines might sleep, etc: - */ -struct trace_iterator { - struct trace_array *tr; - struct tracer *trace; - void *private; - int cpu_file; - struct mutex mutex; - struct ring_buffer_iter *buffer_iter[NR_CPUS]; - - /* The below is zeroed out in pipe_read */ - struct trace_seq seq; - struct trace_entry *ent; - int cpu; - u64 ts; - - unsigned long iter_flags; - loff_t pos; - long idx; - - cpumask_var_t started; -}; - int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void trace_wake_up(void); @@ -479,15 +430,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr, struct ring_buffer_event *event, unsigned long flags, int pc); -struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, - unsigned long flags, int pc); -void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, - unsigned long flags, int pc); -void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, - unsigned long flags, int pc); -void trace_current_buffer_discard_commit(struct ring_buffer_event *event); - struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); @@ -510,7 +452,6 @@ void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *prev, struct task_struct *next, unsigned long flags, int pc); -void tracing_record_cmdline(struct task_struct *tsk); void tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *wakee, @@ -790,28 +731,6 @@ struct ftrace_event_field { int size; }; -struct ftrace_event_call { - char *name; - char *system; - struct dentry *dir; - int enabled; - int (*regfunc)(void); - void (*unregfunc)(void); - int id; - int (*raw_init)(void); - int (*show_format)(struct trace_seq *s); - int (*define_fields)(void); - struct list_head fields; - int n_preds; - struct filter_pred **preds; - -#ifdef CONFIG_EVENT_PROFILE - atomic_t profile_count; - int (*profile_enable)(struct ftrace_event_call *); - void (*profile_disable)(struct ftrace_event_call *); -#endif -}; - struct event_subsystem { struct list_head list; const char *name; @@ -825,9 +744,6 @@ struct event_subsystem { (unsigned long)event < (unsigned long)__stop_ftrace_events; \ event++) -#define MAX_FILTER_PRED 8 -#define MAX_FILTER_STR_VAL 128 - struct filter_pred; typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); @@ -845,9 +761,6 @@ struct filter_pred { int clear; }; -int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size); -extern int init_preds(struct ftrace_event_call *call); extern void filter_free_pred(struct filter_pred *pred); extern void filter_print_preds(struct filter_pred **preds, int n_preds, struct trace_seq *s); @@ -855,13 +768,9 @@ extern int filter_parse(char **pbuf, struct filter_pred *pred); extern int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred); extern void filter_disable_preds(struct ftrace_event_call *call); -extern int filter_match_preds(struct ftrace_event_call *call, void *rec); extern void filter_free_subsystem_preds(struct event_subsystem *system); extern int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred); -extern int filter_current_check_discard(struct ftrace_event_call *call, - void *rec, - struct ring_buffer_event *event); static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, @@ -876,14 +785,6 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } -#define __common_field(type, item) \ - ret = trace_define_field(event_call, #type, "common_" #item, \ - offsetof(typeof(field.ent), item), \ - sizeof(field.ent.item)); \ - if (ret) \ - return ret; - -void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; @@ -895,25 +796,6 @@ extern struct ftrace_event_call __stop_ftrace_events[]; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; -/* - * The double __builtin_constant_p is because gcc will give us an error - * if we try to allocate the static variable to fmt if it is not a - * constant. Even with the outer if statement optimizing out. - */ -#define event_trace_printk(ip, fmt, args...) \ -do { \ - __trace_printk_check_format(fmt, ##args); \ - tracing_record_cmdline(current); \ - if (__builtin_constant_p(fmt)) { \ - static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))) = \ - __builtin_constant_p(fmt) ? fmt : NULL; \ - \ - __trace_bprintk(ip, trace_printk_fmt, ##args); \ - } else \ - __trace_printk(ip, fmt, ##args); \ -} while (0) - #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ extern struct ftrace_event_call event_##call; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 5c7cbfb65c7..6e220a8e570 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -4,18 +4,6 @@ #include #include "trace.h" -typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, - int flags); - -struct trace_event { - struct hlist_node node; - int type; - trace_print_func trace; - trace_print_func raw; - trace_print_func hex; - trace_print_func binary; -}; - extern enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t @@ -33,8 +21,6 @@ extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); extern struct trace_event *ftrace_find_event(int type); -extern int register_ftrace_event(struct trace_event *event); -extern int unregister_ftrace_event(struct trace_event *event); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); -- cgit v1.2.3 From f42c85e74faa422cf0bc747ed808681145448f88 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 13 Apr 2009 12:25:37 -0400 Subject: tracing/events: move the ftrace event tracing code to core This patch moves the ftrace creation into include/trace/ftrace.h and simplifies the work of developers in adding new tracepoints. Just the act of creating the trace points in include/trace and including define_trace.h will create the events in the debugfs/tracing/events directory. This patch removes the need of include/trace/trace_events.h Signed-off-by: Steven Rostedt --- kernel/trace/Makefile | 1 - kernel/trace/events.c | 15 -- kernel/trace/trace_events_stage_1.h | 39 ----- kernel/trace/trace_events_stage_2.h | 170 ---------------------- kernel/trace/trace_events_stage_3.h | 279 ------------------------------------ 5 files changed, 504 deletions(-) delete mode 100644 kernel/trace/events.c delete mode 100644 kernel/trace/trace_events_stage_1.h delete mode 100644 kernel/trace/trace_events_stage_2.h delete mode 100644 kernel/trace/trace_events_stage_3.h (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 3ad367e7c97..fb9d7f96489 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -41,7 +41,6 @@ obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_EVENT_TRACING) += trace_events.o -obj-$(CONFIG_EVENT_TRACER) += events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o diff --git a/kernel/trace/events.c b/kernel/trace/events.c deleted file mode 100644 index 5a35a914f0e..00000000000 --- a/kernel/trace/events.c +++ /dev/null @@ -1,15 +0,0 @@ -/* - * This is the place to register all trace points as events. - */ - -#include - -#include - -#include "trace_output.h" - -#define TRACE_HEADER_MULTI_READ -#include "trace_events_stage_1.h" -#include "trace_events_stage_2.h" -#include "trace_events_stage_3.h" - diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h deleted file mode 100644 index 475f46a047a..00000000000 --- a/kernel/trace/trace_events_stage_1.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Stage 1 of the trace events. - * - * Override the macros in to include the following: - * - * struct ftrace_raw_ { - * struct trace_entry ent; - * ; - * []; - * [...] - * }; - * - * The is created by the __field(type, item) macro or - * the __array(type2, item2, len) macro. - * We simply do "type item;", and that will create the fields - * in the structure. - */ - -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) - -#undef __array -#define __array(type, item, len) type item[len]; - -#undef __field -#define __field(type, item) type item; - -#undef TP_STRUCT__entry -#define TP_STRUCT__entry(args...) args - -#undef TRACE_EVENT -#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ - struct ftrace_raw_##name { \ - struct trace_entry ent; \ - tstruct \ - }; \ - static struct ftrace_event_call event_##name - -#include diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h deleted file mode 100644 index aa4a67a0656..00000000000 --- a/kernel/trace/trace_events_stage_2.h +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Stage 2 of the trace events. - * - * Override the macros in to include the following: - * - * enum print_line_t - * ftrace_raw_output_(struct trace_iterator *iter, int flags) - * { - * struct trace_seq *s = &iter->seq; - * struct ftrace_raw_ *field; <-- defined in stage 1 - * struct trace_entry *entry; - * int ret; - * - * entry = iter->ent; - * - * if (entry->type != event_.id) { - * WARN_ON_ONCE(1); - * return TRACE_TYPE_UNHANDLED; - * } - * - * field = (typeof(field))entry; - * - * ret = trace_seq_printf(s, "\n"); - * if (!ret) - * return TRACE_TYPE_PARTIAL_LINE; - * - * return TRACE_TYPE_HANDLED; - * } - * - * This is the method used to print the raw event to the trace - * output format. Note, this is not needed if the data is read - * in binary. - */ - -#undef __entry -#define __entry field - -#undef TP_printk -#define TP_printk(fmt, args...) fmt "\n", args - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ -enum print_line_t \ -ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ -{ \ - struct trace_seq *s = &iter->seq; \ - struct ftrace_raw_##call *field; \ - struct trace_entry *entry; \ - int ret; \ - \ - entry = iter->ent; \ - \ - if (entry->type != event_##call.id) { \ - WARN_ON_ONCE(1); \ - return TRACE_TYPE_UNHANDLED; \ - } \ - \ - field = (typeof(field))entry; \ - \ - ret = trace_seq_printf(s, #call ": " print); \ - if (!ret) \ - return TRACE_TYPE_PARTIAL_LINE; \ - \ - return TRACE_TYPE_HANDLED; \ -} - -#include - -/* - * Setup the showing format of trace point. - * - * int - * ftrace_format_##call(struct trace_seq *s) - * { - * struct ftrace_raw_##call field; - * int ret; - * - * ret = trace_seq_printf(s, #type " " #item ";" - * " offset:%u; size:%u;\n", - * offsetof(struct ftrace_raw_##call, item), - * sizeof(field.type)); - * - * } - */ - -#undef TP_STRUCT__entry -#define TP_STRUCT__entry(args...) args - -#undef __field -#define __field(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef __array -#define __array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef __entry -#define __entry REC - -#undef TP_printk -#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) - -#undef TP_fast_assign -#define TP_fast_assign(args...) args - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ -static int \ -ftrace_format_##call(struct trace_seq *s) \ -{ \ - struct ftrace_raw_##call field; \ - int ret; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#include - -#undef __field -#define __field(type, item) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ - if (ret) \ - return ret; - -#undef __array -#define __array(type, item, len) \ - BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - ret = trace_define_field(event_call, #type "[" #len "]", #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ - if (ret) \ - return ret; - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ -int \ -ftrace_define_fields_##call(void) \ -{ \ - struct ftrace_raw_##call field; \ - struct ftrace_event_call *event_call = &event_##call; \ - int ret; \ - \ - __common_field(unsigned char, type); \ - __common_field(unsigned char, flags); \ - __common_field(unsigned char, preempt_count); \ - __common_field(int, pid); \ - __common_field(int, tgid); \ - \ - tstruct; \ - \ - return ret; \ -} - -#include diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h deleted file mode 100644 index 45c04e1f38d..00000000000 --- a/kernel/trace/trace_events_stage_3.h +++ /dev/null @@ -1,279 +0,0 @@ -/* - * Stage 3 of the trace events. - * - * Override the macros in to include the following: - * - * static void ftrace_event_(proto) - * { - * event_trace_printk(_RET_IP_, ": " ); - * } - * - * static int ftrace_reg_event_(void) - * { - * int ret; - * - * ret = register_trace_(ftrace_event_); - * if (!ret) - * pr_info("event trace: Could not activate trace point " - * "probe to "); - * return ret; - * } - * - * static void ftrace_unreg_event_(void) - * { - * unregister_trace_(ftrace_event_); - * } - * - * For those macros defined with TRACE_FORMAT: - * - * static struct ftrace_event_call __used - * __attribute__((__aligned__(4))) - * __attribute__((section("_ftrace_events"))) event_ = { - * .name = "", - * .regfunc = ftrace_reg_event_, - * .unregfunc = ftrace_unreg_event_, - * } - * - * - * For those macros defined with TRACE_EVENT: - * - * static struct ftrace_event_call event_; - * - * static void ftrace_raw_event_(proto) - * { - * struct ring_buffer_event *event; - * struct ftrace_raw_ *entry; <-- defined in stage 1 - * unsigned long irq_flags; - * int pc; - * - * local_save_flags(irq_flags); - * pc = preempt_count(); - * - * event = trace_current_buffer_lock_reserve(event_.id, - * sizeof(struct ftrace_raw_), - * irq_flags, pc); - * if (!event) - * return; - * entry = ring_buffer_event_data(event); - * - * ; <-- Here we assign the entries by the __field and - * __array macros. - * - * trace_current_buffer_unlock_commit(event, irq_flags, pc); - * } - * - * static int ftrace_raw_reg_event_(void) - * { - * int ret; - * - * ret = register_trace_(ftrace_raw_event_); - * if (!ret) - * pr_info("event trace: Could not activate trace point " - * "probe to "); - * return ret; - * } - * - * static void ftrace_unreg_event_(void) - * { - * unregister_trace_(ftrace_raw_event_); - * } - * - * static struct trace_event ftrace_event_type_ = { - * .trace = ftrace_raw_output_, <-- stage 2 - * }; - * - * static int ftrace_raw_init_event_(void) - * { - * int id; - * - * id = register_ftrace_event(&ftrace_event_type_); - * if (!id) - * return -ENODEV; - * event_.id = id; - * return 0; - * } - * - * static struct ftrace_event_call __used - * __attribute__((__aligned__(4))) - * __attribute__((section("_ftrace_events"))) event_ = { - * .name = "", - * .system = "", - * .raw_init = ftrace_raw_init_event_, - * .regfunc = ftrace_reg_event_, - * .unregfunc = ftrace_unreg_event_, - * .show_format = ftrace_format_, - * } - * - */ - -#undef TP_FMT -#define TP_FMT(fmt, args...) fmt "\n", ##args - -#ifdef CONFIG_EVENT_PROFILE -#define _TRACE_PROFILE(call, proto, args) \ -static void ftrace_profile_##call(proto) \ -{ \ - extern void perf_tpcounter_event(int); \ - perf_tpcounter_event(event_##call.id); \ -} \ - \ -static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \ -{ \ - int ret = 0; \ - \ - if (!atomic_inc_return(&call->profile_count)) \ - ret = register_trace_##call(ftrace_profile_##call); \ - \ - return ret; \ -} \ - \ -static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \ -{ \ - if (atomic_add_negative(-1, &call->profile_count)) \ - unregister_trace_##call(ftrace_profile_##call); \ -} - -#define _TRACE_PROFILE_INIT(call) \ - .profile_count = ATOMIC_INIT(-1), \ - .profile_enable = ftrace_profile_enable_##call, \ - .profile_disable = ftrace_profile_disable_##call, - -#else -#define _TRACE_PROFILE(call, proto, args) -#define _TRACE_PROFILE_INIT(call) -#endif - -#define _TRACE_FORMAT(call, proto, args, fmt) \ -static void ftrace_event_##call(proto) \ -{ \ - event_trace_printk(_RET_IP_, #call ": " fmt); \ -} \ - \ -static int ftrace_reg_event_##call(void) \ -{ \ - int ret; \ - \ - ret = register_trace_##call(ftrace_event_##call); \ - if (ret) \ - pr_info("event trace: Could not activate trace point " \ - "probe to " #call "\n"); \ - return ret; \ -} \ - \ -static void ftrace_unreg_event_##call(void) \ -{ \ - unregister_trace_##call(ftrace_event_##call); \ -} \ - \ -static struct ftrace_event_call event_##call; \ - \ -static int ftrace_init_event_##call(void) \ -{ \ - int id; \ - \ - id = register_ftrace_event(NULL); \ - if (!id) \ - return -ENODEV; \ - event_##call.id = id; \ - return 0; \ -} - -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) \ -_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ -static struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_init_event_##call, \ - .regfunc = ftrace_reg_event_##call, \ - .unregfunc = ftrace_unreg_event_##call, \ - _TRACE_PROFILE_INIT(call) \ -} - -#undef __entry -#define __entry entry - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ - \ -static struct ftrace_event_call event_##call; \ - \ -static void ftrace_raw_event_##call(proto) \ -{ \ - struct ftrace_event_call *call = &event_##call; \ - struct ring_buffer_event *event; \ - struct ftrace_raw_##call *entry; \ - unsigned long irq_flags; \ - int pc; \ - \ - local_save_flags(irq_flags); \ - pc = preempt_count(); \ - \ - event = trace_current_buffer_lock_reserve(event_##call.id, \ - sizeof(struct ftrace_raw_##call), \ - irq_flags, pc); \ - if (!event) \ - return; \ - entry = ring_buffer_event_data(event); \ - \ - assign; \ - \ - if (!filter_current_check_discard(call, entry, event)) \ - trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ -} \ - \ -static int ftrace_raw_reg_event_##call(void) \ -{ \ - int ret; \ - \ - ret = register_trace_##call(ftrace_raw_event_##call); \ - if (ret) \ - pr_info("event trace: Could not activate trace point " \ - "probe to " #call "\n"); \ - return ret; \ -} \ - \ -static void ftrace_raw_unreg_event_##call(void) \ -{ \ - unregister_trace_##call(ftrace_raw_event_##call); \ -} \ - \ -static struct trace_event ftrace_event_type_##call = { \ - .trace = ftrace_raw_output_##call, \ -}; \ - \ -static int ftrace_raw_init_event_##call(void) \ -{ \ - int id; \ - \ - id = register_ftrace_event(&ftrace_event_type_##call); \ - if (!id) \ - return -ENODEV; \ - event_##call.id = id; \ - INIT_LIST_HEAD(&event_##call.fields); \ - init_preds(&event_##call); \ - return 0; \ -} \ - \ -static struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_raw_init_event_##call, \ - .regfunc = ftrace_raw_reg_event_##call, \ - .unregfunc = ftrace_raw_unreg_event_##call, \ - .show_format = ftrace_format_##call, \ - .define_fields = ftrace_define_fields_##call, \ - _TRACE_PROFILE_INIT(call) \ -} - -#include - -#undef _TRACE_PROFILE -#undef _TRACE_PROFILE_INIT - -- cgit v1.2.3 From a59fd6027218bd7c994e39d14afe0242f895144f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 13:52:20 -0400 Subject: tracing/events: convert event call sites to use a link list Impact: makes it possible to define events in modules The events are created by reading down the section that they are linked in by the macros. But this is not scalable to modules. This patch converts the manipulations to use a global link list, and on boot up it adds the items in the section to the list. This change will allow modules to add their tracing events to the list as well. Note, this change alone does not permit modules to use the TRACE_EVENT macros, but the change is needed for them to eventually do so. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 13 +--------- kernel/trace/trace_event_profile.c | 4 +-- kernel/trace/trace_events.c | 51 +++++++++++++++++++++++--------------- kernel/trace/trace_events_filter.c | 8 +++--- 4 files changed, 38 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6bcdf4af9b2..8817c18ef97 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -739,11 +739,6 @@ struct event_subsystem { struct filter_pred **preds; }; -#define events_for_each(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) - struct filter_pred; typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); @@ -785,13 +780,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } -extern struct ftrace_event_call __start_ftrace_events[]; -extern struct ftrace_event_call __stop_ftrace_events[]; - -#define for_each_event(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) +extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 199de9c7422..7bf2ad65eee 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -11,7 +11,7 @@ int ftrace_profile_enable(int event_id) { struct ftrace_event_call *event; - for_each_event(event) { + list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id) return event->profile_enable(event); } @@ -23,7 +23,7 @@ void ftrace_profile_disable(int event_id) { struct ftrace_event_call *event; - for_each_event(event) { + list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id) return event->profile_disable(event); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ead68ac9919..5c66aaff07c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -19,6 +19,8 @@ static DEFINE_MUTEX(event_mutex); +LIST_HEAD(ftrace_events); + int trace_define_field(struct ftrace_event_call *call, char *type, char *name, int offset, int size) { @@ -54,16 +56,14 @@ err: static void ftrace_clear_events(void) { - struct ftrace_event_call *call = (void *)__start_ftrace_events; - + struct ftrace_event_call *call; - while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { + list_for_each_entry(call, &ftrace_events, list) { if (call->enabled) { call->enabled = 0; call->unregfunc(); } - call++; } } @@ -89,7 +89,7 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, static int ftrace_set_clr_event(char *buf, int set) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; char *event = NULL, *sub = NULL, *match; int ret = -EINVAL; @@ -118,7 +118,7 @@ static int ftrace_set_clr_event(char *buf, int set) } mutex_lock(&event_mutex); - for_each_event(call) { + list_for_each_entry(call, &ftrace_events, list) { if (!call->name || !call->regfunc) continue; @@ -224,15 +224,17 @@ ftrace_event_write(struct file *file, const char __user *ubuf, static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = m->private; - struct ftrace_event_call *next = call; + struct list_head *list = m->private; + struct ftrace_event_call *call; (*pos)++; for (;;) { - if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + if (list == &ftrace_events) return NULL; + call = list_entry(list, struct ftrace_event_call, list); + /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. @@ -240,11 +242,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (call->regfunc) break; - call++; - next = call; + list = list->next; } - m->private = ++next; + m->private = list->next; return call; } @@ -257,22 +258,23 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = m->private; - struct ftrace_event_call *next; + struct list_head *list = m->private; + struct ftrace_event_call *call; (*pos)++; retry: - if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + if (list == &ftrace_events) return NULL; + call = list_entry(list, struct ftrace_event_call, list); + if (!call->enabled) { - call++; + list = list->next; goto retry; } - next = call; - m->private = ++next; + m->private = list->next; return call; } @@ -312,7 +314,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *m = file->private_data; - m->private = __start_ftrace_events; + m->private = ftrace_events.next; } return ret; } @@ -797,9 +799,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return 0; } +extern struct ftrace_event_call __start_ftrace_events[]; +extern struct ftrace_event_call __stop_ftrace_events[]; + +#define for_each_event(event) \ + for (event = __start_ftrace_events; \ + (unsigned long)event < (unsigned long)__stop_ftrace_events; \ + event++) + static __init int event_trace_init(void) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; struct dentry *d_tracer; struct dentry *entry; struct dentry *d_events; @@ -830,6 +840,7 @@ static __init int event_trace_init(void) /* The linker may leave blanks */ if (!call->name) continue; + list_add(&call->list, &ftrace_events); event_create_dir(call, d_events); } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index de42dad42a8..d30b06b02b4 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -223,7 +223,7 @@ oom: void filter_free_subsystem_preds(struct event_subsystem *system) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; int i; if (system->n_preds) { @@ -234,7 +234,7 @@ void filter_free_subsystem_preds(struct event_subsystem *system) system->n_preds = 0; } - events_for_each(call) { + list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) continue; @@ -320,7 +320,7 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; if (system->n_preds && !pred->compound) filter_free_subsystem_preds(system); @@ -337,7 +337,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system, system->preds[system->n_preds] = pred; - events_for_each(call) { + list_for_each_entry(call, &ftrace_events, list) { int err; if (!call->define_fields) -- cgit v1.2.3 From 17c873ec280a03894bc718af817f7f24fa787ae1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 18:12:50 -0400 Subject: tracing/events: add export symbols for trace events in modules Impact: let modules add trace events The trace event code requires some functions to be exported to allow modules to use TRACE_EVENT. This patch adds EXPORT_SYMBOL_GPL to the necessary functions. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 +++ kernel/trace/trace_events.c | 1 + kernel/trace/trace_events_filter.c | 2 ++ kernel/trace/trace_output.c | 3 +++ 4 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c0047fcf707..2d69b26b3cc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -176,6 +176,7 @@ int filter_current_check_discard(struct ftrace_event_call *call, void *rec, { return filter_check_discard(call, rec, global_trace.buffer, event); } +EXPORT_SYMBOL_GPL(filter_current_check_discard); cycle_t ftrace_now(int cpu) { @@ -886,6 +887,7 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, return trace_buffer_lock_reserve(&global_trace, type, len, flags, pc); } +EXPORT_SYMBOL(trace_current_buffer_lock_reserve); void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) @@ -903,6 +905,7 @@ void trace_current_buffer_discard_commit(struct ring_buffer_event *event) { ring_buffer_discard_commit(global_trace.buffer, event); } +EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); void trace_function(struct trace_array *tr, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5c66aaff07c..8b9e621b80b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -53,6 +53,7 @@ err: return -ENOMEM; } +EXPORT_SYMBOL_GPL(trace_define_field); static void ftrace_clear_events(void) { diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d30b06b02b4..f8e5eab0424 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -110,6 +110,7 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec) return 1; } +EXPORT_SYMBOL_GPL(filter_match_preds); void filter_print_preds(struct filter_pred **preds, int n_preds, struct trace_seq *s) @@ -220,6 +221,7 @@ oom: return -ENOMEM; } +EXPORT_SYMBOL_GPL(init_preds); void filter_free_subsystem_preds(struct event_subsystem *system) { diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0e70fb07ca7..83a8abb9640 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -94,6 +94,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) return len; } +EXPORT_SYMBOL_GPL(trace_seq_printf); int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { @@ -538,6 +539,7 @@ int register_ftrace_event(struct trace_event *event) return ret; } +EXPORT_SYMBOL_GPL(register_ftrace_event); /** * unregister_ftrace_event - remove a no longer used event @@ -551,6 +553,7 @@ int unregister_ftrace_event(struct trace_event *event) return 0; } +EXPORT_SYMBOL_GPL(unregister_ftrace_event); /* * Standard events -- cgit v1.2.3 From 6d723736e472f7a0cd5b62c84152fceead241328 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 14:53:50 -0400 Subject: tracing/events: add support for modules to TRACE_EVENT Impact: allow modules to add TRACE_EVENTS on load This patch adds the final hooks to allow modules to use the TRACE_EVENT macro. A notifier and a data structure are used to link the TRACE_EVENTs defined in the module to connect them with the ftrace event tracing system. It also adds the necessary automated clean ups to the trace events when a module is removed. Cc: Rusty Russell Signed-off-by: Steven Rostedt --- kernel/module.c | 7 +++ kernel/trace/trace_events.c | 128 +++++++++++++++++++++++++++++++++----------- 2 files changed, 103 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index e797812a4d9..a0394706f10 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -18,6 +18,7 @@ */ #include #include +#include #include #include #include @@ -2172,6 +2173,12 @@ static noinline struct module *load_module(void __user *umod, sizeof(*mod->tracepoints), &mod->num_tracepoints); #endif +#ifdef CONFIG_EVENT_TRACING + mod->trace_events = section_objs(hdr, sechdrs, secstrings, + "_ftrace_events", + sizeof(*mod->trace_events), + &mod->num_trace_events); +#endif #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8b9e621b80b..a4b177720a6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -713,7 +713,13 @@ event_subsystem_dir(const char *name, struct dentry *d_events) return d_events; } - system->name = name; + system->name = kstrdup(name, GFP_KERNEL); + if (!system->name) { + debugfs_remove(system->entry); + kfree(system); + return d_events; + } + list_add(&system->list, &event_subsystems); system->preds = NULL; @@ -738,7 +744,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". */ - if (strcmp(call->system, "TRACE_SYSTEM") != 0) + if (strcmp(call->system, TRACE_SYSTEM) != 0) d_events = event_subsystem_dir(call->system, d_events); if (call->raw_init) { @@ -757,21 +763,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return -1; } - if (call->regfunc) { - entry = debugfs_create_file("enable", 0644, call->dir, call, - &ftrace_enable_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/enable' entry\n", call->name); - } + if (call->regfunc) + entry = trace_create_file("enable", 0644, call->dir, call, + &ftrace_enable_fops); - if (call->id) { - entry = debugfs_create_file("id", 0444, call->dir, call, - &ftrace_event_id_fops); - if (!entry) - pr_warning("Could not create debugfs '%s/id' entry\n", - call->name); - } + if (call->id) + entry = trace_create_file("id", 0444, call->dir, call, + &ftrace_event_id_fops); if (call->define_fields) { ret = call->define_fields(); @@ -780,40 +778,102 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) " events/%s\n", call->name); return ret; } - entry = debugfs_create_file("filter", 0644, call->dir, call, - &ftrace_event_filter_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/filter' entry\n", call->name); + entry = trace_create_file("filter", 0644, call->dir, call, + &ftrace_event_filter_fops); } /* A trace may not want to export its format */ if (!call->show_format) return 0; - entry = debugfs_create_file("format", 0444, call->dir, call, - &ftrace_event_format_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/format' entry\n", call->name); + entry = trace_create_file("format", 0444, call->dir, call, + &ftrace_event_format_fops); + + return 0; +} + +#define for_each_event(event, start, end) \ + for (event = start; \ + (unsigned long)event < (unsigned long)end; \ + event++) + +static void trace_module_add_events(struct module *mod) +{ + struct ftrace_event_call *call, *start, *end; + struct dentry *d_events; + + start = mod->trace_events; + end = mod->trace_events + mod->num_trace_events; + + if (start == end) + return; + + d_events = event_trace_events_dir(); + if (!d_events) + return; + + for_each_event(call, start, end) { + /* The linker may leave blanks */ + if (!call->name) + continue; + call->mod = mod; + list_add(&call->list, &ftrace_events); + event_create_dir(call, d_events); + } +} + +static void trace_module_remove_events(struct module *mod) +{ + struct ftrace_event_call *call, *p; + + list_for_each_entry_safe(call, p, &ftrace_events, list) { + if (call->mod == mod) { + if (call->enabled) { + call->enabled = 0; + call->unregfunc(); + } + if (call->event) + unregister_ftrace_event(call->event); + debugfs_remove_recursive(call->dir); + list_del(&call->list); + } + } +} + +int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + mutex_lock(&event_mutex); + switch (val) { + case MODULE_STATE_COMING: + trace_module_add_events(mod); + break; + case MODULE_STATE_GOING: + trace_module_remove_events(mod); + break; + } + mutex_unlock(&event_mutex); return 0; } +struct notifier_block trace_module_nb = { + .notifier_call = trace_module_notify, + .priority = 0, +}; + extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; -#define for_each_event(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) - static __init int event_trace_init(void) { struct ftrace_event_call *call; struct dentry *d_tracer; struct dentry *entry; struct dentry *d_events; + int ret; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -837,7 +897,7 @@ static __init int event_trace_init(void) if (!d_events) return 0; - for_each_event(call) { + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ if (!call->name) continue; @@ -845,6 +905,10 @@ static __init int event_trace_init(void) event_create_dir(call, d_events); } + ret = register_module_notifier(&trace_module_nb); + if (!ret) + pr_warning("Failed to register trace events module notifier\n"); + return 0; } fs_initcall(event_trace_init); -- cgit v1.2.3 From 19e4529ee7345079eeacc8e40cf69a304a64dc23 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 14 Apr 2009 17:27:18 +1000 Subject: modules: Fix up build when CONFIG_MODULE_UNLOAD=n. Commit 3d43321b7015387cfebbe26436d0e9d299162ea1 ("modules: sysctl to block module loading") introduces a modules_disabled variable that is only defined if CONFIG_MODULE_UNLOAD is enabled, despite being used in other places. This moves it up and fixes up the build. CC kernel/module.o kernel/module.c: In function 'sys_init_module': kernel/module.c:2401: error: 'modules_disabled' undeclared (first use in this function) kernel/module.c:2401: error: (Each undeclared identifier is reported only once kernel/module.c:2401: error: for each function it appears in.) make[1]: *** [kernel/module.o] Error 1 make: *** [kernel/module.o] Error 2 Signed-off-by: Paul Mundt Signed-off-by: James Morris --- kernel/module.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index eeb3f7b1383..ee7ab612daf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -71,6 +71,9 @@ static DEFINE_MUTEX(module_mutex); static LIST_HEAD(modules); +/* Block module loading/unloading? */ +int modules_disabled = 0; + /* Waiting for a module to finish initializing? */ static DECLARE_WAIT_QUEUE_HEAD(module_wq); @@ -778,9 +781,6 @@ static void wait_for_zero_refcount(struct module *mod) mutex_lock(&module_mutex); } -/* Block module loading/unloading? */ -int modules_disabled = 0; - SYSCALL_DEFINE2(delete_module, const char __user *, name_user, unsigned int, flags) { -- cgit v1.2.3 From 61f919a12fbdc3fd20f980a34a118d597198a392 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Apr 2009 18:22:32 -0400 Subject: tracing/events: fix compile for modules disabled Impact: compile fix The addition of TRACE_EVENT for modules breaks the build for when modules are disabled. This code fixes that. Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a4b177720a6..6591d83e1e7 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -797,6 +797,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) (unsigned long)event < (unsigned long)end; \ event++) +#ifdef CONFIG_MODULES static void trace_module_add_events(struct module *mod) { struct ftrace_event_call *call, *start, *end; @@ -840,8 +841,8 @@ static void trace_module_remove_events(struct module *mod) } } -int trace_module_notify(struct notifier_block *self, - unsigned long val, void *data) +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) { struct module *mod = data; @@ -858,6 +859,13 @@ int trace_module_notify(struct notifier_block *self, return 0; } +#else +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, -- cgit v1.2.3 From ad8d75fff811a6a230f7f43b05a6483099349533 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Apr 2009 19:39:12 -0400 Subject: tracing/events: move trace point headers into include/trace/events Impact: clean up Create a sub directory in include/trace called events to keep the trace point headers in their own separate directory. Only headers that declare trace points should be defined in this directory. Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neil Horman Cc: Zhao Lei Cc: Eduard - Gabriel Munteanu Cc: Pekka Enberg Signed-off-by: Steven Rostedt --- kernel/exit.c | 2 +- kernel/fork.c | 3 ++- kernel/irq/handle.c | 2 +- kernel/kthread.c | 2 +- kernel/lockdep.c | 2 +- kernel/sched.c | 2 +- kernel/signal.c | 2 +- kernel/softirq.c | 2 +- kernel/trace/ftrace.c | 2 +- kernel/trace/trace_sched_switch.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- 11 files changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 2fe9d2c7eee..cab535c427b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,7 +48,7 @@ #include #include #include -#include +#include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 4bebf263923..085f73ebcea 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -61,7 +61,6 @@ #include #include #include -#include #include #include @@ -71,6 +70,8 @@ #include #include +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 983d8be8dff..37c63633e78 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -20,7 +20,7 @@ #include #define CREATE_TRACE_POINTS -#include +#include #include "internals.h" diff --git a/kernel/kthread.c b/kernel/kthread.c index e1c76924545..41c88fe4050 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #define KTHREAD_NICE_LEVEL (-5) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 257f21a76c5..47b201ecc6d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -48,7 +48,7 @@ #include "lockdep_internals.h" #define CREATE_TRACE_POINTS -#include +#include #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; diff --git a/kernel/sched.c b/kernel/sched.c index e6d4518d47e..9f7ffd00b6e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -79,7 +79,7 @@ #include "sched_cpupri.h" #define CREATE_TRACE_POINTS -#include +#include /* * Convert user-nice values [ -20 ... 0 ... 19 ] diff --git a/kernel/signal.c b/kernel/signal.c index 1d5703ff003..94ec0a4dde0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include diff --git a/kernel/softirq.c b/kernel/softirq.c index a2d9b458ac2..7ab9dfd8d08 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8e6a0b5c994..a2348898858 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -29,7 +29,7 @@ #include #include -#include +#include #include diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 9d8cccdfaa0..a98106dd979 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include "trace.h" diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 5bc00e8f153..b8b13c5540f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include "trace.h" -- cgit v1.2.3 From 13318a7186d8e0ae08c996ea4111a945e7789772 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 Apr 2009 09:59:10 +0800 Subject: sched: use group_first_cpu() instead of cpumask_first(sched_group_cpus()) Impact: cleanup This patch changes cpumask_first(sched_group_cpus()) to group_first_cpu() for maintainability. Signed-off-by: Miao Xie Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 92b4b56ad09..7601ceebf7c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7995,7 +7995,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) struct sched_domain *sd; sd = &per_cpu(phys_domains, j).sd; - if (j != cpumask_first(sched_group_cpus(sd->groups))) { + if (j != group_first_cpu(sd->groups)) { /* * Only add "power" once for each * physical package. @@ -8073,7 +8073,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) WARN_ON(!sd || !sd->groups); - if (cpu != cpumask_first(sched_group_cpus(sd->groups))) + if (cpu != group_first_cpu(sd->groups)) return; child = sd->child; -- cgit v1.2.3 From ff7b1b4f000cea84f071c1b6aa2918b2119d66f1 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 15 Apr 2009 16:55:05 +0100 Subject: perfcounters: export perf_tpcounter_event Needed for modular tracepoint support. Signed-off-by: Steven Whitehouse Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7f9521c3c01..09396098dd0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2590,6 +2590,7 @@ void perf_tpcounter_event(int event_id) __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); } +EXPORT_SYMBOL_GPL(perf_tpcounter_event); extern int ftrace_profile_enable(int); extern void ftrace_profile_disable(int); -- cgit v1.2.3 From d0deef5b14af7d5bbd0003a0a2a1a32326e20a6d Mon Sep 17 00:00:00 2001 From: Shawn Du Date: Tue, 14 Apr 2009 13:58:56 +0800 Subject: blktrace: support per-partition tracing Though one can specify '-d /dev/sda1' when using blktrace, it still traces the whole sda. To support per-partition tracing, when we start tracing, we initialize bt->start_lba and bt->end_lba to the start and end sector of that partition. Note some actions are per device, thus we don't filter 0-sector events. The original patch and discussion can be found here: http://marc.info/?l=linux-btrace&m=122949374214540&w=2 Signed-off-by: Shawn Du Signed-off-by: Li Zefan Acked-by: "Theodore Ts'o" Cc: Arnaldo Carvalho de Melo Cc: Jens Axboe LKML-Reference: <49E42620.4050701@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2b98195b338..e932654cf59 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -147,7 +147,7 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, { if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) return 1; - if (sector < bt->start_lba || sector > bt->end_lba) + if (sector && (sector < bt->start_lba || sector > bt->end_lba)) return 1; if (bt->pid && pid != bt->pid) return 1; @@ -192,7 +192,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(rw, DISCARD); pid = tsk->pid; - if (unlikely(act_log_check(bt, what, sector, pid))) + if (act_log_check(bt, what, sector, pid)) return; cpu = raw_smp_processor_id(); @@ -407,11 +407,13 @@ static struct rchan_callbacks blk_relay_callbacks = { * Setup everything required to start tracing */ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct blk_user_trace_setup *buts) + struct block_device *bdev, + struct blk_user_trace_setup *buts) { struct blk_trace *old_bt, *bt = NULL; struct dentry *dir = NULL; int ret, i; + struct hd_struct *part = NULL; if (!buts->buf_size || !buts->buf_nr) return -EINVAL; @@ -480,11 +482,21 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->act_mask) bt->act_mask = (u16) -1; - bt->start_lba = buts->start_lba; - bt->end_lba = buts->end_lba; - if (!bt->end_lba) + if (bdev) + part = bdev->bd_part; + + if (part) { + bt->start_lba = part->start_sect; + bt->end_lba = part->start_sect + part->nr_sects; + } else bt->end_lba = -1ULL; + /* overwrite with user settings */ + if (buts->start_lba) + bt->start_lba = buts->start_lba; + if (buts->end_lba) + bt->end_lba = buts->end_lba; + bt->pid = buts->pid; bt->trace_state = Blktrace_setup; @@ -505,6 +517,7 @@ err: } int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg) { struct blk_user_trace_setup buts; @@ -514,7 +527,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (ret) return -EFAULT; - ret = do_blk_trace_setup(q, name, dev, &buts); + ret = do_blk_trace_setup(q, name, dev, bdev, &buts); if (ret) return ret; @@ -582,7 +595,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) switch (cmd) { case BLKTRACESETUP: bdevname(bdev, b); - ret = blk_trace_setup(q, b, bdev->bd_dev, arg); + ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; case BLKTRACESTART: start = 1; -- cgit v1.2.3 From 9908c30997b8a73c95f836170b9998dae9aa3f4a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 14 Apr 2009 13:59:34 +0800 Subject: blktrace: support per-partition tracing for ftrace plugin The previous patch adds support to trace a single partition for relay+ioctl blktrace, and this patch is for ftrace plugin blktrace: # echo 1 > /sys/block/sda/sda7/enable # cat start_lba 102398373 # cat end_lba 102703545 Signed-off-by: Li Zefan Acked-by: "Theodore Ts'o" Cc: Arnaldo Carvalho de Melo Cc: Shawn Du Cc: Jens Axboe LKML-Reference: <49E42646.4060608@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e932654cf59..d1098988052 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -403,6 +403,23 @@ static struct rchan_callbacks blk_relay_callbacks = { .remove_buf_file = blk_remove_buf_file_callback, }; +static void blk_trace_setup_lba(struct blk_trace *bt, + struct block_device *bdev) +{ + struct hd_struct *part = NULL; + + if (bdev) + part = bdev->bd_part; + + if (part) { + bt->start_lba = part->start_sect; + bt->end_lba = part->start_sect + part->nr_sects; + } else { + bt->start_lba = 0; + bt->end_lba = -1ULL; + } +} + /* * Setup everything required to start tracing */ @@ -413,7 +430,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct blk_trace *old_bt, *bt = NULL; struct dentry *dir = NULL; int ret, i; - struct hd_struct *part = NULL; if (!buts->buf_size || !buts->buf_nr) return -EINVAL; @@ -482,14 +498,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->act_mask) bt->act_mask = (u16) -1; - if (bdev) - part = bdev->bd_part; - - if (part) { - bt->start_lba = part->start_sect; - bt->end_lba = part->start_sect + part->nr_sects; - } else - bt->end_lba = -1ULL; + blk_trace_setup_lba(bt, bdev); /* overwrite with user settings */ if (buts->start_lba) @@ -1370,7 +1379,8 @@ static int blk_trace_remove_queue(struct request_queue *q) /* * Setup everything required to start tracing */ -static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) +static int blk_trace_setup_queue(struct request_queue *q, + struct block_device *bdev) { struct blk_trace *old_bt, *bt = NULL; int ret = -ENOMEM; @@ -1383,9 +1393,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) if (!bt->msg_data) goto free_bt; - bt->dev = dev; + bt->dev = bdev->bd_dev; bt->act_mask = (u16)-1; - bt->end_lba = -1ULL; + + blk_trace_setup_lba(bt, bdev); old_bt = xchg(&q->blk_trace, bt); if (old_bt != NULL) { @@ -1602,7 +1613,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (attr == &dev_attr_enable) { if (value) - ret = blk_trace_setup_queue(q, bdev->bd_dev); + ret = blk_trace_setup_queue(q, bdev); else ret = blk_trace_remove_queue(q); goto out_unlock_bdev; @@ -1610,7 +1621,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, ret = 0; if (q->blk_trace == NULL) - ret = blk_trace_setup_queue(q, bdev->bd_dev); + ret = blk_trace_setup_queue(q, bdev); if (ret == 0) { if (attr == &dev_attr_act_mask) -- cgit v1.2.3 From 1d54ad6da9192fed5dd3b60224d9f2dfea0dcd82 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 14 Apr 2009 14:00:05 +0800 Subject: blktrace: add trace/ to /sys/block/sda Impact: allow ftrace-plugin blktrace to trace device-mapper devices To trace a single partition: # echo 1 > /sys/block/sda/sda1/enable To trace the whole sda instead: # echo 1 > /sys/block/sda/enable Thus we also fix an issue reported by Ted, that ftrace-plugin blktrace can't be used to trace device-mapper devices. Now: # echo 1 > /sys/block/dm-0/trace/enable echo: write error: No such device or address # mount -t ext4 /dev/dm-0 /mnt # echo 1 > /sys/block/dm-0/trace/enable # echo blk > /debug/tracing/current_tracer Reported-by: Theodore Tso Signed-off-by: Li Zefan Acked-by: "Theodore Ts'o" Cc: Arnaldo Carvalho de Melo Cc: Shawn Du Cc: Jens Axboe LKML-Reference: <49E42665.6020506@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d1098988052..8e7c5da3a3e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1644,3 +1644,8 @@ out: return ret ? ret : count; } +int blk_trace_init_sysfs(struct device *dev) +{ + return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); +} + -- cgit v1.2.3 From f3948f8857ef5de239f28a61dddb1554a0ae4c2c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 15 Apr 2009 11:02:56 +0800 Subject: blktrace: fix context-info when mixed-using blk tracer and trace events When current tracer is set to blk tracer, TRACE_ITER_CONTEXT_INFO is unset, but actually context-info is printed: pdflush-431 [000] 821.181576: 8,0 P N [pdflush] And then if we enable TRACE_ITER_CONTEXT_INFO: # echo context-info > trace_options We'll see context-info printed twice. What's worse, when we use blk tracer and trace events at the same time, we'll see no context-info for trace events at all: jbd2_commit_logging: dev dm-0:8 transaction 333227 jbd2_end_commit: dev dm-0:8 transaction 333227 head 332814 rm-25433 [001] 9578.307485: 8,18 m N cfq25433 slice expired t=0 rm-25433 [001] 9578.307486: 8,18 m N cfq25433 put_queue This patch adds blk_tracer->set_flags(), and context-info flag is unset only when we set the output to classic mode. Note after this patch, one should unset context-info explicitly if he wants to get binary output that can be parsed by blkparse: # echo nocontext-info > trace_options # echo bin > trace_options # echo blk > current_tracer # cat trace_pipe | blkparse -i - Reported-by: Theodore Ts'o Signed-off-by: Li Zefan Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <49E54E60.50408@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 8e7c5da3a3e..c32062bd10b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1211,7 +1211,6 @@ static void blk_tracer_print_header(struct seq_file *m) static void blk_tracer_start(struct trace_array *tr) { blk_tracer_enabled = true; - trace_flags &= ~TRACE_ITER_CONTEXT_INFO; } static int blk_tracer_init(struct trace_array *tr) @@ -1224,7 +1223,6 @@ static int blk_tracer_init(struct trace_array *tr) static void blk_tracer_stop(struct trace_array *tr) { blk_tracer_enabled = false; - trace_flags |= TRACE_ITER_CONTEXT_INFO; } static void blk_tracer_reset(struct trace_array *tr) @@ -1289,9 +1287,6 @@ out: static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, int flags) { - if (!trace_print_context(iter)) - return TRACE_TYPE_PARTIAL_LINE; - return print_one_line(iter, false); } @@ -1326,6 +1321,18 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) return print_one_line(iter, true); } +static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set) +{ + /* don't output context-info for blk_classic output */ + if (bit == TRACE_BLK_OPT_CLASSIC) { + if (set) + trace_flags &= ~TRACE_ITER_CONTEXT_INFO; + else + trace_flags |= TRACE_ITER_CONTEXT_INFO; + } + return 0; +} + static struct tracer blk_tracer __read_mostly = { .name = "blk", .init = blk_tracer_init, @@ -1335,6 +1342,7 @@ static struct tracer blk_tracer __read_mostly = { .print_header = blk_tracer_print_header, .print_line = blk_tracer_print_line, .flags = &blk_tracer_flags, + .set_flag = blk_tracer_set_flag, }; static struct trace_event trace_blk_event = { -- cgit v1.2.3 From 93eb677d74a4f7d3edfb678c94f6c0544d9fbad2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 Apr 2009 13:24:06 -0400 Subject: ftrace: use module notifier for function tracer The hooks in the module code for the function tracer must be called before any of that module code runs. The function tracer hooks modify the module (replacing calls to mcount to nops). If the code is executed while the change occurs, then the CPU can take a GPF. To handle the above with a bit of paranoia, I originally implemented the hooks as calls directly from the module code. After examining the notifier calls, it looks as though the start up notify is called before any of the module's code is executed. This makes the use of the notify safe with ftrace. Only the startup notify is required to be "safe". The shutdown simply removes the entries from the ftrace function list, and does not modify any code. This change has another benefit. It removes a issue with a reverse dependency in the mutexes of ftrace_lock and module_mutex. [ Impact: fix lock dependency bug, cleanup ] Cc: Rusty Russell Signed-off-by: Steven Rostedt --- kernel/module.c | 19 ++++------- kernel/trace/ftrace.c | 90 ++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 71 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index a0394706f10..2383e60fcf3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1490,9 +1490,6 @@ static void free_module(struct module *mod) /* Free any allocated parameters. */ destroy_params(mod->kp, mod->num_kp); - /* release any pointers to mcount in this module */ - ftrace_release(mod->module_core, mod->core_size); - /* This may be NULL, but that's OK */ module_free(mod, mod->module_init); kfree(mod->args); @@ -1893,11 +1890,9 @@ static noinline struct module *load_module(void __user *umod, unsigned int symindex = 0; unsigned int strindex = 0; unsigned int modindex, versindex, infoindex, pcpuindex; - unsigned int num_mcount; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ - unsigned long *mseg; mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2179,7 +2174,13 @@ static noinline struct module *load_module(void __user *umod, sizeof(*mod->trace_events), &mod->num_trace_events); #endif - +#ifdef CONFIG_FTRACE_MCOUNT_RECORD + /* sechdrs[0].sh_size is always zero */ + mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings, + "__mcount_loc", + sizeof(*mod->ftrace_callsites), + &mod->num_ftrace_callsites); +#endif #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) @@ -2244,11 +2245,6 @@ static noinline struct module *load_module(void __user *umod, dynamic_debug_setup(debug, num_debug); } - /* sechdrs[0].sh_size is always zero */ - mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", - sizeof(*mseg), &num_mcount); - ftrace_init_module(mod, mseg, mseg + num_mcount); - err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; @@ -2309,7 +2305,6 @@ static noinline struct module *load_module(void __user *umod, cleanup: kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); - ftrace_release(mod->module_core, mod->core_size); free_unload: module_unload_free(mod); #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a2348898858..5b606f45b6c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -916,30 +916,6 @@ static void ftrace_free_rec(struct dyn_ftrace *rec) rec->flags |= FTRACE_FL_FREE; } -void ftrace_release(void *start, unsigned long size) -{ - struct dyn_ftrace *rec; - struct ftrace_page *pg; - unsigned long s = (unsigned long)start; - unsigned long e = s + size; - - if (ftrace_disabled || !start) - return; - - mutex_lock(&ftrace_lock); - do_for_each_ftrace_rec(pg, rec) { - if ((rec->ip >= s) && (rec->ip < e)) { - /* - * rec->ip is changed in ftrace_free_rec() - * It should not between s and e if record was freed. - */ - FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); - ftrace_free_rec(rec); - } - } while_for_each_ftrace_rec(); - mutex_unlock(&ftrace_lock); -} - static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { struct dyn_ftrace *rec; @@ -2752,14 +2728,72 @@ static int ftrace_convert_nops(struct module *mod, return 0; } -void ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end) +#ifdef CONFIG_MODULES +void ftrace_release(void *start, void *end) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + unsigned long s = (unsigned long)start; + unsigned long e = (unsigned long)end; + + if (ftrace_disabled || !start || start == end) + return; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_rec(pg, rec) { + if ((rec->ip >= s) && (rec->ip < e)) { + /* + * rec->ip is changed in ftrace_free_rec() + * It should not between s and e if record was freed. + */ + FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); + ftrace_free_rec(rec); + } + } while_for_each_ftrace_rec(); + mutex_unlock(&ftrace_lock); +} + +static void ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end) { if (ftrace_disabled || start == end) return; ftrace_convert_nops(mod, start, end); } +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + ftrace_init_module(mod, mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); + break; + case MODULE_STATE_GOING: + ftrace_release(mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); + break; + } + + return 0; +} +#else +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + +struct notifier_block ftrace_module_nb = { + .notifier_call = ftrace_module_notify, + .priority = 0, +}; + extern unsigned long __start_mcount_loc[]; extern unsigned long __stop_mcount_loc[]; @@ -2791,6 +2825,10 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); + ret = register_module_notifier(&ftrace_module_nb); + if (!ret) + pr_warning("Failed to register trace ftrace module notifier\n"); + return; failed: ftrace_disabled = 1; -- cgit v1.2.3 From e6187007d6c365b551c69ea3df46f06fd1c8bd19 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 Apr 2009 13:36:40 -0400 Subject: tracing/events: add startup tests for events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As events start to become popular, and the new way to add tracing infrastructure into ftrace, it is important to catch any problems that might happen with a mistake in the TRACE_EVENT macro. This patch introduces a startup self test on the registered trace events. Note, it can only do a generic test, any type of testing that needs more involement is needed to be implemented by the tracepoint creators. The test goes down one by one enabling a trace point and running some random tasks (random in the sense that I just made them up). Those tasks are creating threads, grabbing mutexes and spinlocks and using workqueues. After testing each event individually, it does the same test after enabling each system of trace points. Like sched, irq, lockdep. Then finally it enables all tracepoints and performs the tasks again. The output to the console on bootup will look like this when everything works: Running tests on trace events: Testing event kfree_skb: OK Testing event kmalloc: OK Testing event kmem_cache_alloc: OK Testing event kmalloc_node: OK Testing event kmem_cache_alloc_node: OK Testing event kfree: OK Testing event kmem_cache_free: OK Testing event irq_handler_exit: OK Testing event irq_handler_entry: OK Testing event softirq_entry: OK Testing event softirq_exit: OK Testing event lock_acquire: OK Testing event lock_release: OK Testing event sched_kthread_stop: OK Testing event sched_kthread_stop_ret: OK Testing event sched_wait_task: OK Testing event sched_wakeup: OK Testing event sched_wakeup_new: OK Testing event sched_switch: OK Testing event sched_migrate_task: OK Testing event sched_process_free: OK Testing event sched_process_exit: OK Testing event sched_process_wait: OK Testing event sched_process_fork: OK Testing event sched_signal_send: OK Running tests on trace event systems: Testing event system skb: OK Testing event system kmem: OK Testing event system irq: OK Testing event system lockdep: OK Testing event system sched: OK Running tests on all trace events: Testing all events: OK [ folded in: tracing: add #include to fix build failure in test_work() This build failure occured on a few rare configs: kernel/trace/trace_events.c: In function ‘test_work’: kernel/trace/trace_events.c:975: error: implicit declaration of function ‘udelay’ kernel/trace/trace_events.c:980: error: implicit declaration of function ‘msleep’ delay.h is included in way too many other headers, hiding cases where new usage is added without header inclusion. [ Impact: build fix ] Signed-off-by: Ingo Molnar ] [ Impact: add event tracer self-tests ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 178 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6591d83e1e7..f81d6eec4e4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -8,10 +8,14 @@ * */ +#include +#include +#include #include #include #include #include +#include #include "trace_output.h" @@ -920,3 +924,177 @@ static __init int event_trace_init(void) return 0; } fs_initcall(event_trace_init); + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +static DEFINE_SPINLOCK(test_spinlock); +static DEFINE_SPINLOCK(test_spinlock_irq); +static DEFINE_MUTEX(test_mutex); + +static __init void test_work(struct work_struct *dummy) +{ + spin_lock(&test_spinlock); + spin_lock_irq(&test_spinlock_irq); + udelay(1); + spin_unlock_irq(&test_spinlock_irq); + spin_unlock(&test_spinlock); + + mutex_lock(&test_mutex); + msleep(1); + mutex_unlock(&test_mutex); +} + +static __init int event_test_thread(void *unused) +{ + void *test_malloc; + + test_malloc = kmalloc(1234, GFP_KERNEL); + if (!test_malloc) + pr_info("failed to kmalloc\n"); + + schedule_on_each_cpu(test_work); + + kfree(test_malloc); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) + schedule(); + + return 0; +} + +/* + * Do various things that may trigger events. + */ +static __init void event_test_stuff(void) +{ + struct task_struct *test_thread; + + test_thread = kthread_run(event_test_thread, NULL, "test-events"); + msleep(1); + kthread_stop(test_thread); +} + +/* + * For every trace event defined, we will test each trace point separately, + * and then by groups, and finally all trace points. + */ +static __init int event_trace_self_tests(void) +{ + struct ftrace_event_call *call; + struct event_subsystem *system; + char *sysname; + int ret; + + pr_info("Running tests on trace events:\n"); + + list_for_each_entry(call, &ftrace_events, list) { + + /* Only test those that have a regfunc */ + if (!call->regfunc) + continue; + + pr_info("Testing event %s: ", call->name); + + /* + * If an event is already enabled, someone is using + * it and the self test should not be on. + */ + if (call->enabled) { + pr_warning("Enabled event during self test!\n"); + WARN_ON_ONCE(1); + continue; + } + + call->enabled = 1; + call->regfunc(); + + event_test_stuff(); + + call->unregfunc(); + call->enabled = 0; + + pr_cont("OK\n"); + } + + /* Now test at the sub system level */ + + pr_info("Running tests on trace event systems:\n"); + + list_for_each_entry(system, &event_subsystems, list) { + + /* the ftrace system is special, skip it */ + if (strcmp(system->name, "ftrace") == 0) + continue; + + pr_info("Testing event system %s: ", system->name); + + /* ftrace_set_clr_event can modify the name passed in. */ + sysname = kstrdup(system->name, GFP_KERNEL); + if (WARN_ON(!sysname)) { + pr_warning("Can't allocate memory, giving up!\n"); + return 0; + } + ret = ftrace_set_clr_event(sysname, 1); + kfree(sysname); + if (WARN_ON_ONCE(ret)) { + pr_warning("error enabling system %s\n", + system->name); + continue; + } + + event_test_stuff(); + + sysname = kstrdup(system->name, GFP_KERNEL); + if (WARN_ON(!sysname)) { + pr_warning("Can't allocate memory, giving up!\n"); + return 0; + } + ret = ftrace_set_clr_event(sysname, 0); + kfree(sysname); + + if (WARN_ON_ONCE(ret)) + pr_warning("error disabling system %s\n", + system->name); + + pr_cont("OK\n"); + } + + /* Test with all events enabled */ + + pr_info("Running tests on all trace events:\n"); + pr_info("Testing all events: "); + + sysname = kmalloc(4, GFP_KERNEL); + if (WARN_ON(!sysname)) { + pr_warning("Can't allocate memory, giving up!\n"); + return 0; + } + memcpy(sysname, "*:*", 4); + ret = ftrace_set_clr_event(sysname, 1); + if (WARN_ON_ONCE(ret)) { + kfree(sysname); + pr_warning("error enabling all events\n"); + return 0; + } + + event_test_stuff(); + + /* reset sysname */ + memcpy(sysname, "*:*", 4); + ret = ftrace_set_clr_event(sysname, 0); + kfree(sysname); + + if (WARN_ON_ONCE(ret)) { + pr_warning("error disabling all events\n"); + return 0; + } + + pr_cont("OK\n"); + + return 0; +} + +late_initcall(event_trace_self_tests); + +#endif -- cgit v1.2.3 From d1b182a8d49ed6416325b4e0a1cb0f17cd4e702a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 Apr 2009 16:53:47 -0400 Subject: tracing/events/ring-buffer: expose format of ring buffer headers to users Currently, every thing needed to read the binary output from the ring buffers is available, with the exception of the way the ring buffers handles itself internally. This patch creates two special files in the debugfs/tracing/events directory: # cat /debug/tracing/events/header_page field: u64 timestamp; offset:0; size:8; field: local_t commit; offset:8; size:8; field: char data; offset:16; size:4080; # cat /debug/tracing/events/header_event type : 2 bits len : 3 bits time_delta : 27 bits array : 32 bits padding : type == 0 time_extend : type == 1 data : type == 3 This is to allow a userspace app to see if the ring buffer format changes or not. [ Impact: allow userspace apps to know of ringbuffer format changes ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f935bd5ec3e..84a6055f37c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -21,6 +21,28 @@ #include "trace.h" +/* + * The ring buffer header is special. We must manually up keep it. + */ +int ring_buffer_print_entry_header(struct trace_seq *s) +{ + int ret; + + ret = trace_seq_printf(s, "\ttype : 2 bits\n"); + ret = trace_seq_printf(s, "\tlen : 3 bits\n"); + ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); + ret = trace_seq_printf(s, "\tarray : 32 bits\n"); + ret = trace_seq_printf(s, "\n"); + ret = trace_seq_printf(s, "\tpadding : type == %d\n", + RINGBUF_TYPE_PADDING); + ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", + RINGBUF_TYPE_TIME_EXTEND); + ret = trace_seq_printf(s, "\tdata : type == %d\n", + RINGBUF_TYPE_DATA); + + return ret; +} + /* * The ring buffer is made up of a list of pages. A separate list of pages is * allocated for each CPU. A writer may only write to a buffer that is @@ -340,6 +362,28 @@ static inline int test_time_stamp(u64 delta) #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) +int ring_buffer_print_page_header(struct trace_seq *s) +{ + struct buffer_data_page field; + int ret; + + ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%u;\n", + (unsigned int)sizeof(field.time_stamp)); + + ret = trace_seq_printf(s, "\tfield: local_t commit;\t" + "offset:%u;\tsize:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit)); + + ret = trace_seq_printf(s, "\tfield: char data;\t" + "offset:%u;\tsize:%u;\n", + (unsigned int)offsetof(typeof(field), data), + (unsigned int)BUF_PAGE_SIZE); + + return ret; +} + /* * head_page == tail_page && head == tail then buffer is empty. */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f81d6eec4e4..7163a2bb021 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -610,6 +610,30 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static ssize_t +show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + int (*func)(struct trace_seq *s) = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + func(s); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + + kfree(s); + + return r; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -667,6 +691,11 @@ static const struct file_operations ftrace_subsystem_filter_fops = { .write = subsystem_filter_write, }; +static const struct file_operations ftrace_show_header_fops = { + .open = tracing_open_generic, + .read = show_header, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -909,6 +938,15 @@ static __init int event_trace_init(void) if (!d_events) return 0; + /* ring buffer internal formats */ + trace_create_file("header_page", 0444, d_events, + ring_buffer_print_page_header, + &ftrace_show_header_fops); + + trace_create_file("header_event", 0444, d_events, + ring_buffer_print_entry_header, + &ftrace_show_header_fops); + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ if (!call->name) -- cgit v1.2.3 From 69abe6a5d18a9394baa325bab8f57748b037c517 Mon Sep 17 00:00:00 2001 From: Avadh Patel Date: Fri, 10 Apr 2009 16:04:48 -0400 Subject: tracing: add saved_cmdlines file to show cached task comms Export the cached task comms to userspace. This allows user apps to translate the pids from a trace into their respective task command lines. [ Impact: let userspace apps reading binary buffer know comm's of pids ] Signed-off-by: Avadh Patel [ added error checking and use of buf pointer to index file_buf ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2d69b26b3cc..031c46f11bb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2421,6 +2421,56 @@ static const struct file_operations tracing_readme_fops = { .read = tracing_readme_read, }; +static ssize_t +tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *buf_comm; + char *file_buf; + char *buf; + int len = 0; + int pid; + int i; + + file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); + if (!file_buf) + return -ENOMEM; + + buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); + if (!buf_comm) { + kfree(file_buf); + return -ENOMEM; + } + + buf = file_buf; + + for (i = 0; i < SAVED_CMDLINES; i++) { + int r; + + pid = map_cmdline_to_pid[i]; + if (pid == -1 || pid == NO_CMDLINE_MAP) + continue; + + trace_find_cmdline(pid, buf_comm); + r = sprintf(buf, "%d %s\n", pid, buf_comm); + buf += r; + len += r; + } + + len = simple_read_from_buffer(ubuf, cnt, ppos, + file_buf, len); + + kfree(file_buf); + kfree(buf_comm); + + return len; +} + +static const struct file_operations tracing_saved_cmdlines_fops = { + .open = tracing_open_generic, + .read = tracing_saved_cmdlines_read, +}; + static ssize_t tracing_ctrl_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3973,6 +4023,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("trace_marker", 0220, d_tracer, NULL, &tracing_mark_fops); + trace_create_file("saved_cmdlines", 0444, d_tracer, + NULL, &tracing_saved_cmdlines_fops); + #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); -- cgit v1.2.3 From 9ea21c1ecdb35ecdcac5fd9d95f62a1f6a7ffec0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 16 Apr 2009 12:15:44 -0400 Subject: tracing/events: perform function tracing in event selftests We can find some bugs in the trace events if we stress the writes as well. The function tracer is a good way to stress the events. [ Impact: extend scope of event tracer self-tests ] Signed-off-by: Steven Rostedt Cc: Andrew Morton Cc: Peter Zijlstra Cc: Frederic Weisbecker LKML-Reference: <20090416161746.604786131@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 78 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7163a2bb021..1137f951be4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1017,7 +1017,7 @@ static __init void event_test_stuff(void) * For every trace event defined, we will test each trace point separately, * and then by groups, and finally all trace points. */ -static __init int event_trace_self_tests(void) +static __init void event_trace_self_tests(void) { struct ftrace_event_call *call; struct event_subsystem *system; @@ -1071,7 +1071,7 @@ static __init int event_trace_self_tests(void) sysname = kstrdup(system->name, GFP_KERNEL); if (WARN_ON(!sysname)) { pr_warning("Can't allocate memory, giving up!\n"); - return 0; + return; } ret = ftrace_set_clr_event(sysname, 1); kfree(sysname); @@ -1086,7 +1086,7 @@ static __init int event_trace_self_tests(void) sysname = kstrdup(system->name, GFP_KERNEL); if (WARN_ON(!sysname)) { pr_warning("Can't allocate memory, giving up!\n"); - return 0; + return; } ret = ftrace_set_clr_event(sysname, 0); kfree(sysname); @@ -1106,14 +1106,14 @@ static __init int event_trace_self_tests(void) sysname = kmalloc(4, GFP_KERNEL); if (WARN_ON(!sysname)) { pr_warning("Can't allocate memory, giving up!\n"); - return 0; + return; } memcpy(sysname, "*:*", 4); ret = ftrace_set_clr_event(sysname, 1); if (WARN_ON_ONCE(ret)) { kfree(sysname); pr_warning("error enabling all events\n"); - return 0; + return; } event_test_stuff(); @@ -1125,10 +1125,76 @@ static __init int event_trace_self_tests(void) if (WARN_ON_ONCE(ret)) { pr_warning("error disabling all events\n"); - return 0; + return; } pr_cont("OK\n"); +} + +#ifdef CONFIG_FUNCTION_TRACER + +static DEFINE_PER_CPU(atomic_t, test_event_disable); + +static void +function_test_events_call(unsigned long ip, unsigned long parent_ip) +{ + struct ring_buffer_event *event; + struct ftrace_entry *entry; + unsigned long flags; + long disabled; + int resched; + int cpu; + int pc; + + pc = preempt_count(); + resched = ftrace_preempt_disable(); + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); + + if (disabled != 1) + goto out; + + local_save_flags(flags); + + event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry), + flags, pc); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->parent_ip = parent_ip; + + trace_current_buffer_unlock_commit(event, flags, pc); + + out: + atomic_dec(&per_cpu(test_event_disable, cpu)); + ftrace_preempt_enable(resched); +} + +static struct ftrace_ops trace_ops __initdata = +{ + .func = function_test_events_call, +}; + +static __init void event_trace_self_test_with_function(void) +{ + register_ftrace_function(&trace_ops); + pr_info("Running tests again, along with the function tracer\n"); + event_trace_self_tests(); + unregister_ftrace_function(&trace_ops); +} +#else +static __init void event_trace_self_test_with_function(void) +{ +} +#endif + +static __init int event_trace_self_tests_init(void) +{ + + event_trace_self_tests(); + + event_trace_self_test_with_function(); return 0; } -- cgit v1.2.3 From 339ae5d3c3fc2025e3657637921495fd600027c7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 17 Apr 2009 10:34:30 +0800 Subject: tracing: fix file mode of trace and README trace is read-write and README is read-only. [ Impact: fix /debug/tracing/ file permissions. ] Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <49E7EAB6.4070605@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 031c46f11bb..f681f646aa0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4002,7 +4002,7 @@ static __init int tracer_init_debugfs(void) trace_create_file("available_tracers", 0444, d_tracer, &global_trace, &show_traces_fops); - trace_create_file("current_tracer", 0444, d_tracer, + trace_create_file("current_tracer", 0644, d_tracer, &global_trace, &set_tracer_fops); trace_create_file("tracing_max_latency", 0644, d_tracer, @@ -4011,7 +4011,7 @@ static __init int tracer_init_debugfs(void) trace_create_file("tracing_thresh", 0644, d_tracer, &tracing_thresh, &tracing_max_lat_fops); - trace_create_file("README", 0644, d_tracer, + trace_create_file("README", 0444, d_tracer, NULL, &tracing_readme_fops); trace_create_file("trace_pipe", 0444, d_tracer, -- cgit v1.2.3 From ac1adc55fc71c7515caa2eb0e63e49b3d1c6a47c Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 17 Apr 2009 00:27:08 -0500 Subject: tracing/filters: add filter_mutex to protect filter predicates This patch adds a filter_mutex to prevent the filter predicates from being accessed concurrently by various external functions. It's based on a previous patch by Li Zefan: "[PATCH 7/7] tracing/filters: make filter preds RCU safe" v2 changes: - fixed wrong value returned in a add_subsystem_pred() failure case noticed by Li Zefan. [ Impact: fix trace filter corruption/crashes on parallel access ] Signed-off-by: Tom Zanussi Reviewed-by: Li Zefan Tested-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: paulmck@linux.vnet.ibm.com LKML-Reference: <1239946028.6639.13.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 4 +- kernel/trace/trace_events.c | 4 +- kernel/trace/trace_events_filter.c | 90 +++++++++++++++++++++++++++++--------- 3 files changed, 75 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8817c18ef97..247948e81b0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -757,13 +757,15 @@ struct filter_pred { }; extern void filter_free_pred(struct filter_pred *pred); -extern void filter_print_preds(struct filter_pred **preds, int n_preds, +extern void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s); extern int filter_parse(char **pbuf, struct filter_pred *pred); extern int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred); extern void filter_disable_preds(struct ftrace_event_call *call); extern void filter_free_subsystem_preds(struct event_subsystem *system); +extern void filter_print_subsystem_preds(struct event_subsystem *system, + struct trace_seq *s); extern int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1137f951be4..64f9d6d2735 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -488,7 +488,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(call->preds, call->n_preds, s); + filter_print_preds(call, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -558,7 +558,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(system->preds, system->n_preds, s); + filter_print_subsystem_preds(system, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f8e5eab0424..e0fcfd2a16d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -22,10 +22,13 @@ #include #include #include +#include #include "trace.h" #include "trace_output.h" +static DEFINE_MUTEX(filter_mutex); + static int filter_pred_64(struct filter_pred *pred, void *event) { u64 *addr = (u64 *)(event + pred->offset); @@ -112,8 +115,8 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec) } EXPORT_SYMBOL_GPL(filter_match_preds); -void filter_print_preds(struct filter_pred **preds, int n_preds, - struct trace_seq *s) +static void __filter_print_preds(struct filter_pred **preds, int n_preds, + struct trace_seq *s) { char *field_name; struct filter_pred *pred; @@ -138,6 +141,21 @@ void filter_print_preds(struct filter_pred **preds, int n_preds, } } +void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s) +{ + mutex_lock(&filter_mutex); + __filter_print_preds(call->preds, call->n_preds, s); + mutex_unlock(&filter_mutex); +} + +void filter_print_subsystem_preds(struct event_subsystem *system, + struct trace_seq *s) +{ + mutex_lock(&filter_mutex); + __filter_print_preds(system->preds, system->n_preds, s); + mutex_unlock(&filter_mutex); +} + static struct ftrace_event_field * find_event_field(struct ftrace_event_call *call, char *name) { @@ -180,7 +198,7 @@ static int filter_set_pred(struct filter_pred *dest, return 0; } -void filter_disable_preds(struct ftrace_event_call *call) +static void __filter_disable_preds(struct ftrace_event_call *call) { int i; @@ -190,6 +208,13 @@ void filter_disable_preds(struct ftrace_event_call *call) call->preds[i]->fn = filter_pred_none; } +void filter_disable_preds(struct ftrace_event_call *call) +{ + mutex_lock(&filter_mutex); + __filter_disable_preds(call); + mutex_unlock(&filter_mutex); +} + int init_preds(struct ftrace_event_call *call) { struct filter_pred *pred; @@ -223,7 +248,7 @@ oom: } EXPORT_SYMBOL_GPL(init_preds); -void filter_free_subsystem_preds(struct event_subsystem *system) +static void __filter_free_subsystem_preds(struct event_subsystem *system) { struct ftrace_event_call *call; int i; @@ -241,18 +266,25 @@ void filter_free_subsystem_preds(struct event_subsystem *system) continue; if (!strcmp(call->system, system->name)) - filter_disable_preds(call); + __filter_disable_preds(call); } } -static int __filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred, - filter_pred_fn_t fn) +void filter_free_subsystem_preds(struct event_subsystem *system) +{ + mutex_lock(&filter_mutex); + __filter_free_subsystem_preds(system); + mutex_unlock(&filter_mutex); +} + +static int filter_add_pred_fn(struct ftrace_event_call *call, + struct filter_pred *pred, + filter_pred_fn_t fn) { int idx, err; if (call->n_preds && !pred->compound) - filter_disable_preds(call); + __filter_disable_preds(call); if (call->n_preds == MAX_FILTER_PRED) return -ENOSPC; @@ -276,7 +308,8 @@ static int is_string_field(const char *type) return 0; } -int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) +static int __filter_add_pred(struct ftrace_event_call *call, + struct filter_pred *pred) { struct ftrace_event_field *field; filter_pred_fn_t fn; @@ -293,7 +326,7 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) return -EINVAL; fn = filter_pred_string; pred->str_len = field->size; - return __filter_add_pred(call, pred, fn); + return filter_add_pred_fn(call, pred, fn); } else { if (pred->str_len) return -EINVAL; @@ -316,7 +349,18 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) return -EINVAL; } - return __filter_add_pred(call, pred, fn); + return filter_add_pred_fn(call, pred, fn); +} + +int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) +{ + int err; + + mutex_lock(&filter_mutex); + err = __filter_add_pred(call, pred); + mutex_unlock(&filter_mutex); + + return err; } int filter_add_subsystem_pred(struct event_subsystem *system, @@ -324,20 +368,27 @@ int filter_add_subsystem_pred(struct event_subsystem *system, { struct ftrace_event_call *call; + mutex_lock(&filter_mutex); + if (system->n_preds && !pred->compound) - filter_free_subsystem_preds(system); + __filter_free_subsystem_preds(system); if (!system->n_preds) { system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!system->preds) + if (!system->preds) { + mutex_unlock(&filter_mutex); return -ENOMEM; + } } - if (system->n_preds == MAX_FILTER_PRED) + if (system->n_preds == MAX_FILTER_PRED) { + mutex_unlock(&filter_mutex); return -ENOSPC; + } system->preds[system->n_preds] = pred; + system->n_preds++; list_for_each_entry(call, &ftrace_events, list) { int err; @@ -348,17 +399,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system, if (strcmp(call->system, system->name)) continue; - if (!find_event_field(call, pred->field_name)) - continue; - - err = filter_add_pred(call, pred); + err = __filter_add_pred(call, pred); if (err == -ENOMEM) { system->preds[system->n_preds] = NULL; + system->n_preds--; + mutex_unlock(&filter_mutex); return err; } } - system->n_preds++; + mutex_unlock(&filter_mutex); return 0; } -- cgit v1.2.3 From 12acd473d45cf2e40de3782cb2de712e5cd4d715 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 17 Apr 2009 16:01:56 -0400 Subject: tracing: add EXPORT_SYMBOL_GPL for trace commits Not all the necessary symbols were exported to allow for tracing by modules. This patch adds them in. [ Impact: allow modules to commit data to the ring buffer ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f681f646aa0..183d788038e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -894,18 +894,20 @@ void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, { __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); } +EXPORT_SYMBOL(trace_current_buffer_unlock_commit); void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); } +EXPORT_SYMBOL(trace_nowake_buffer_unlock_commit); void trace_current_buffer_discard_commit(struct ring_buffer_event *event) { ring_buffer_discard_commit(global_trace.buffer, event); } -EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); +EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); void trace_function(struct trace_array *tr, -- cgit v1.2.3 From 261842b7c9099f56de2eb969c8ad65402d68e00e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 16 Apr 2009 21:41:52 -0400 Subject: tracing: add same level recursion detection The tracing infrastructure allows for recursion. That is, an interrupt may interrupt the act of tracing an event, and that interrupt may very well perform its own trace. This is a recursive trace, and is fine to do. The problem arises when there is a bug, and the utility doing the trace calls something that recurses back into the tracer. This recursion is not caused by an external event like an interrupt, but by code that is not expected to recurse. The result could be a lockup. This patch adds a bitmask to the task structure that keeps track of the trace recursion. To find the interrupt depth, the following algorithm is used: level = hardirq_count() + softirq_count() + in_nmi; Here, level will be the depth of interrutps and softirqs, and even handles the nmi. Then the corresponding bit is set in the recursion bitmask. If the bit was already set, we know we had a recursion at the same level and we warn about it and fail the writing to the buffer. After the data has been committed to the buffer, we clear the bit. No atomics are needed. The only races are with interrupts and they reset the bitmask before returning anywy. [ Impact: detect same irq level trace recursion ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 84a6055f37c..b421b0ea911 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1481,6 +1481,40 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, return event; } +static int trace_irq_level(void) +{ + return hardirq_count() + softirq_count() + in_nmi(); +} + +static int trace_recursive_lock(void) +{ + int level; + + level = trace_irq_level(); + + if (unlikely(current->trace_recursion & (1 << level))) { + /* Disable all tracing before we do anything else */ + tracing_off_permanent(); + WARN_ON_ONCE(1); + return -1; + } + + current->trace_recursion |= 1 << level; + + return 0; +} + +static void trace_recursive_unlock(void) +{ + int level; + + level = trace_irq_level(); + + WARN_ON_ONCE(!current->trace_recursion & (1 << level)); + + current->trace_recursion &= ~(1 << level); +} + static DEFINE_PER_CPU(int, rb_need_resched); /** @@ -1514,6 +1548,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) /* If we are tracing schedule, we don't want to recurse */ resched = ftrace_preempt_disable(); + if (trace_recursive_lock()) + goto out_nocheck; + cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) @@ -1543,6 +1580,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) return event; out: + trace_recursive_unlock(); + + out_nocheck: ftrace_preempt_enable(resched); return NULL; } @@ -1581,6 +1621,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); + trace_recursive_unlock(); + /* * Only the last preempt count needs to restore preemption. */ -- cgit v1.2.3 From 3189cdb31622f4e40688ce5a6fc5d940b42bc805 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 17 Apr 2009 16:13:55 -0400 Subject: tracing: protect trace_printk from recursion trace_printk can be called from any context, including NMIs. If this happens, then we must test for for recursion before grabbing any spinlocks. This patch prevents trace_printk from being called recursively. [ Impact: prevent hard lockup in lockdep event tracer ] Cc: Peter Zijlstra Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 183d788038e..b9a3adce922 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1259,6 +1259,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) struct trace_array_cpu *data; struct bprint_entry *entry; unsigned long flags; + int disable; int resched; int cpu, len = 0, size, pc; @@ -1273,7 +1274,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(atomic_read(&data->disabled))) + disable = atomic_inc_return(&data->disabled); + if (unlikely(disable != 1)) goto out; /* Lockdep uses trace_printk for lock tracing */ @@ -1301,6 +1303,7 @@ out_unlock: local_irq_restore(flags); out: + atomic_dec_return(&data->disabled); ftrace_preempt_enable(resched); unpause_graph_tracing(); @@ -1320,6 +1323,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) int cpu, len = 0, size, pc; struct print_entry *entry; unsigned long irq_flags; + int disable; if (tracing_disabled || tracing_selftest_running) return 0; @@ -1329,7 +1333,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(atomic_read(&data->disabled))) + disable = atomic_inc_return(&data->disabled); + if (unlikely(disable != 1)) goto out; pause_graph_tracing(); @@ -1357,6 +1362,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) raw_local_irq_restore(irq_flags); unpause_graph_tracing(); out: + atomic_dec_return(&data->disabled); preempt_enable_notrace(); return len; -- cgit v1.2.3 From e057a5e5647a1c9d0d0054fbd298bfa04b3d1cb4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 19 Apr 2009 23:38:12 +0200 Subject: tracing/core: Add current context on tracing recursion warning In case of tracing recursion detection, we only get the stacktrace. But the current context may be very useful to debug the issue. This patch adds the softirq/hardirq/nmi context with the warning using lockdep context display to have a familiar output. v2: Use printk_once() v3: drop {hardirq,softirq}_context which depend on lockdep, only keep what is part of current->trace_recursion, sufficient to debug the warning source. [ Impact: print context necessary to debug recursion ] Signed-off-by: Frederic Weisbecker --- kernel/trace/ring_buffer.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b421b0ea911..bffde630c4e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1495,6 +1495,13 @@ static int trace_recursive_lock(void) if (unlikely(current->trace_recursion & (1 << level))) { /* Disable all tracing before we do anything else */ tracing_off_permanent(); + + printk_once(KERN_WARNING "Tracing recursion: " + "HC[%lu]:SC[%lu]:NMI[%lu]\n", + hardirq_count() >> HARDIRQ_SHIFT, + softirq_count() >> SOFTIRQ_SHIFT, + in_nmi()); + WARN_ON_ONCE(1); return -1; } -- cgit v1.2.3 From f3b9aae16219aaeca2dd5a9ca69f7a10faa063df Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 19 Apr 2009 23:39:33 +0200 Subject: tracing/ring-buffer: Add unlock recursion protection on discard The pair of helpers trace_recursive_lock() and trace_recursive_unlock() have been introduced recently to provide generic tracing recursion protection. They are used in a symetric way: - trace_recursive_lock() on buffer reserve - trace_recursive_unlock() on buffer commit However sometimes, we don't commit but discard on entry to the buffer, ie: in case of filter checking. Then we must also unlock the recursion protection on discard time, otherwise the tracing gets definitely deactivated and a warning is raised spuriously, such as: 111.119821] ------------[ cut here ]------------ [ 111.119829] WARNING: at kernel/trace/ring_buffer.c:1498 ring_buffer_lock_reserve+0x1b7/0x1d0() [ 111.119835] Hardware name: AMILO Li 2727 [ 111.119839] Modules linked in: [ 111.119846] Pid: 5731, comm: Xorg Tainted: G W 2.6.30-rc1 #69 [ 111.119851] Call Trace: [ 111.119863] [] warn_slowpath+0xd8/0x130 [ 111.119873] [] ? __lock_acquire+0x19f/0x1ae0 [ 111.119882] [] ? __lock_acquire+0x19f/0x1ae0 [ 111.119891] [] ? native_sched_clock+0x20/0x70 [ 111.119899] [] ? put_lock_stats+0xe/0x30 [ 111.119906] [] ? lock_release_holdtime+0xa8/0x150 [ 111.119913] [] ring_buffer_lock_reserve+0x1b7/0x1d0 [ 111.119921] [] trace_buffer_lock_reserve+0x30/0x70 [ 111.119930] [] trace_current_buffer_lock_reserve+0x20/0x30 [ 111.119939] [] ftrace_raw_event_sched_switch+0x58/0x100 [ 111.119948] [] __schedule+0x3a7/0x4cd [ 111.119957] [] ? ftrace_call+0x5/0x2b [ 111.119964] [] ? ftrace_call+0x5/0x2b [ 111.119971] [] schedule+0x18/0x40 [ 111.119977] [] preempt_schedule+0x39/0x60 [ 111.119985] [] _read_unlock+0x53/0x60 [ 111.119993] [] sock_def_readable+0x72/0x80 [ 111.120002] [] unix_stream_sendmsg+0x24d/0x3d0 [ 111.120011] [] sock_aio_write+0x143/0x160 [ 111.120019] [] ? ftrace_call+0x5/0x2b [ 111.120026] [] ? sock_aio_write+0x0/0x160 [ 111.120033] [] ? sock_aio_write+0x0/0x160 [ 111.120042] [] do_sync_readv_writev+0xf3/0x140 [ 111.120049] [] ? ftrace_call+0x5/0x2b [ 111.120057] [] ? autoremove_wake_function+0x0/0x40 [ 111.120067] [] ? cap_file_permission+0x9/0x10 [ 111.120074] [] ? security_file_permission+0x16/0x20 [ 111.120082] [] do_readv_writev+0xd4/0x1f0 [ 111.120089] [] ? ftrace_call+0x5/0x2b [ 111.120097] [] ? ftrace_call+0x5/0x2b [ 111.120105] [] vfs_writev+0x48/0x70 [ 111.120111] [] sys_writev+0x55/0xc0 [ 111.120119] [] system_call_fastpath+0x16/0x1b [ 111.120125] ---[ end trace 15605f4e98d5ccb5 ]--- [ Impact: fix spurious warning triggering tracing shutdown ] Signed-off-by: Frederic Weisbecker --- kernel/trace/ring_buffer.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bffde630c4e..e145969a8ed 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1642,6 +1642,14 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); +static inline void rb_event_discard(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} + /** * ring_buffer_event_discard - discard any event in the ring buffer * @event: the event to discard @@ -1656,10 +1664,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); */ void ring_buffer_event_discard(struct ring_buffer_event *event) { - event->type = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; + rb_event_discard(event); + trace_recursive_unlock(); } EXPORT_SYMBOL_GPL(ring_buffer_event_discard); @@ -1690,7 +1696,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, int cpu; /* The event is discarded regardless */ - ring_buffer_event_discard(event); + rb_event_discard(event); /* * This must only be called if the event has not been @@ -1735,6 +1741,8 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, if (rb_is_commit(cpu_buffer, event)) rb_set_commit_to_write(cpu_buffer); + trace_recursive_unlock(); + /* * Only the last preempt count needs to restore preemption. */ -- cgit v1.2.3 From 9ae5b8790037d05d32746f521af146c32089bfec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 10:27:58 -0400 Subject: tracing: change branch profiling to a choice selection This patch makes the branch profiling into a choice selection: None - no branch profiling likely/unlikely - only profile likely/unlikely branches all - profile all branches The all profiler will also enable the likely/unlikely branches. This does not change the way the profiler works or the dependencies between the profilers. What this patch does, is keep the branch profiling from being selected by an allyesconfig make. The branch profiler is very intrusive and it is known to break various architecture builds when selected as an allyesconfig. [ Impact: prevent branch profiler from being selected in allyesconfig ] Reported-by: Heiko Carstens Reported-by: Al Viro Reported-by: Stephen Rothwell Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 57981d338d1..3ee28db69be 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -212,8 +212,36 @@ config BOOT_TRACER to enable this on bootup. config TRACE_BRANCH_PROFILING - bool "Trace likely/unlikely profiler" + bool select TRACING + +choice + prompt "Branch Profiling" + default BRANCH_PROFILE_NONE + help + The branch profiling is a software profiler. It will add hooks + into the C conditionals to test which path a branch takes. + + The likely/unlikely profiler only looks at the conditions that + are annotated with a likely or unlikely macro. + + The "all branch" profiler will profile every if statement in the + kernel. This profiler will also enable the likely/unlikely + profiler as well. + + Either of the above profilers add a bit of overhead to the system. + If unsure choose "No branch profiling". + +config BRANCH_PROFILE_NONE + bool "No branch profiling" + help + No branch profiling. Branch profiling adds a bit of overhead. + Only enable it if you want to analyse the branching behavior. + Otherwise keep it disabled. + +config PROFILE_ANNOTATED_BRANCHES + bool "Trace likely/unlikely profiler" + select TRACE_BRANCH_PROFILING help This tracer profiles all the the likely and unlikely macros in the kernel. It will display the results in: @@ -223,11 +251,9 @@ config TRACE_BRANCH_PROFILING Note: this will add a significant overhead, only turn this on if you need to profile the system's use of these macros. - Say N if unsure. - config PROFILE_ALL_BRANCHES bool "Profile all if conditionals" - depends on TRACE_BRANCH_PROFILING + select TRACE_BRANCH_PROFILING help This tracer profiles all branch conditions. Every if () taken in the kernel is recorded whether it hit or miss. @@ -235,11 +261,12 @@ config PROFILE_ALL_BRANCHES /debugfs/tracing/profile_branch + This option also enables the likely/unlikely profiler. + This configuration, when enabled, will impose a great overhead on the system. This should only be enabled when the system is to be analyzed - - Say N if unsure. +endchoice config TRACING_BRANCHES bool -- cgit v1.2.3 From 4ed9f0716e46bb9646f26e73f4a1b5b24db7947a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 10:47:36 -0400 Subject: tracing: create menuconfig for tracing infrastructure During testing we often use randconfig to test various kernels. The current configuration set up does not give an easy way to disable all tracing with a single config. The case where randconfig would test all tracing disabled is very unlikely. This patch adds a config option to enable or disable all tracing. It is hooked into the tracing menu just like other submenus are done. [ Impact: allow randconfig to easily produce all traces disabled ] Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3ee28db69be..3fa36d2bc29 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -77,7 +77,12 @@ config TRACING_SUPPORT if TRACING_SUPPORT -menu "Tracers" +menuconfig FTRACE + bool "Tracers" + help + Enable the kernel tracing infrastructure. + +if FTRACE config FUNCTION_TRACER bool "Kernel Function Tracer" @@ -462,7 +467,7 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. -endmenu +endif # FTRACE endif # TRACING_SUPPORT -- cgit v1.2.3 From a7abe97fd8e7a6ccabba5a04a9f17be9211d418c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 10:59:34 -0400 Subject: tracing: rename EVENT_TRACER config to ENABLE_EVENT_TRACING Currently we have two configs: EVENT_TRACING and EVENT_TRACER. All tracers enable EVENT_TRACING. The EVENT_TRACER is only a convenience to enable the EVENT_TRACING when no other tracers are enabled. The names EVENT_TRACER and EVENT_TRACING are too similar and confusing. This patch renames EVENT_TRACER to ENABLE_EVENT_TRACING to be more appropriate to what it actually does, as well as add a comment in the help menu to explain the option's purpose. [ Impact: rename config option to reduce confusion ] Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3fa36d2bc29..450d3c2cfbd 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -183,7 +183,7 @@ config CONTEXT_SWITCH_TRACER This tracer gets called from the context switch and records all switching of tasks. -config EVENT_TRACER +config ENABLE_EVENT_TRACING bool "Trace various events in the kernel" select TRACING help @@ -191,6 +191,10 @@ config EVENT_TRACER allowing the user to pick and choose which trace point they want to trace. + Note, all tracers enable event tracing. This option is + only a convenience to enable event tracing when no other + tracers are selected. + config FTRACE_SYSCALLS bool "Trace syscalls" depends on HAVE_FTRACE_SYSCALLS -- cgit v1.2.3 From 28d20e2d6e94434827e11c310788b87204b84559 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 12:12:44 -0400 Subject: tracing/events: call the correct event trace selftest init function The late_initcall calls a helper function instead of the proper init event selftest function. This update may have been lost due to conflicting merges. [ Impact: fix compiler warning and call extended event trace self tests ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 64f9d6d2735..98daf5dc74a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1199,6 +1199,6 @@ static __init int event_trace_self_tests_init(void) return 0; } -late_initcall(event_trace_self_tests); +late_initcall(event_trace_self_tests_init); #endif -- cgit v1.2.3 From 17487bfeb6cfb05920e6a9d5a54f345f2917b4e7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 13:24:21 -0400 Subject: tracing: fix recursive test level calculation The recursive tests to detect same level recursion in the ring buffers did not account for the hard/softirq_counts to be shifted. Thus the numbers could be larger than then mask to be tested. This patch includes the shift for the calculation of the irq depth. [ Impact: stop false positives in trace recursion detection ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e145969a8ed..aa40ae92233 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1483,7 +1483,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, static int trace_irq_level(void) { - return hardirq_count() + softirq_count() + in_nmi(); + return (hardirq_count() >> HARDIRQ_SHIFT) + + (softirq_count() >> + SOFTIRQ_SHIFT) + + !!in_nmi(); } static int trace_recursive_lock(void) -- cgit v1.2.3 From e395898e98119085f666febbc7b631dd69bc637f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 13:32:44 -0400 Subject: tracing: remove recursive test from ring_buffer_event_discard The ring_buffer_event_discard is not tied to ring_buffer_lock_reserve. It can be called inside or outside the reserve/commit. Even if it is called inside the reserve/commit the commit part must also be called. Only ring_buffer_discard_commit can be used as a replacement for ring_buffer_unlock_commit. This patch removes the trace_recursive_unlock from ring_buffer_event_discard since it would be the wrong place to do so. [Impact: prevent breakage in trace recursive testing ] Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index aa40ae92233..a6997670cc4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1667,7 +1667,6 @@ static inline void rb_event_discard(struct ring_buffer_event *event) void ring_buffer_event_discard(struct ring_buffer_event *event) { rb_event_discard(event); - trace_recursive_unlock(); } EXPORT_SYMBOL_GPL(ring_buffer_event_discard); -- cgit v1.2.3 From f1f9b3b1795da8625e0e6096813c9d18d4a344ce Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 20 Apr 2009 20:38:21 +0200 Subject: perfcounters, sched: remove __task_delta_exec() This function was left orphan by the latest round of sw-counter cleanups. [ Impact: remove unused kernel function ] Signed-off-by: Ingo Molnar --- kernel/sched.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b66a08c2480..a69278eef42 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4546,29 +4546,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); -/* - * Return any ns on the sched_clock that have not yet been banked in - * @p in case that task is currently running. - */ -unsigned long long __task_delta_exec(struct task_struct *p, int update) -{ - s64 delta_exec; - struct rq *rq; - - rq = task_rq(p); - WARN_ON_ONCE(!runqueue_is_locked()); - WARN_ON_ONCE(!task_current(rq, p)); - - if (update) - update_rq_clock(rq); - - delta_exec = rq->clock - p->se.exec_start; - - WARN_ON_ONCE(delta_exec < 0); - - return delta_exec; -} - /* * Return any ns on the sched_clock that have not yet been banked in * @p in case that task is currently running. -- cgit v1.2.3 From ff743345bf7685a207868048a70e23164c4785e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:26 +0100 Subject: sched: remove extra call overhead for schedule() Lai Jiangshan's patch reminded me that I promised Nick to remove that extra call overhead in schedule(). Signed-off-by: Peter Zijlstra LKML-Reference: <20090313112300.927414207@chello.nl> Signed-off-by: Ingo Molnar --- kernel/mutex.c | 4 +++- kernel/sched.c | 12 ++++-------- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index 5d79781394a..e1fb7351040 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -248,7 +248,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* didnt get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - __schedule(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); spin_lock_mutex(&lock->wait_lock, flags); } diff --git a/kernel/sched.c b/kernel/sched.c index 7601ceebf7c..797f6fdabad 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5131,13 +5131,15 @@ pick_next_task(struct rq *rq) /* * schedule() is the main scheduler function. */ -asmlinkage void __sched __schedule(void) +asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; +need_resched: + preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_qsctr_inc(cpu); @@ -5194,15 +5196,9 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; -} -asmlinkage void __sched schedule(void) -{ -need_resched: - preempt_disable(); - __schedule(); preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + if (need_resched()) goto need_resched; } EXPORT_SYMBOL(schedule); -- cgit v1.2.3 From aa18efb2a2f07e1cf062039848e9d369bb358724 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 16:16:11 -0400 Subject: tracing: use recursive counter over irq level Althought using the irq level (hardirq_count, softirq_count and in_nmi) was nice to detect bad recursion right away, but since the counters are not atomically updated with respect to the interrupts, the function tracer might trigger the test from an interrupt handler before the hardirq_count is updated. This will trigger a false warning. This patch converts the recursive detection to a simple counter. If the depth is greater than 16 then the recursive detection will trigger. 16 is more than enough for any nested interrupts. [ Impact: fix false positive trace recursion detection ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 45 ++++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a6997670cc4..7bcfd3e6053 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1481,47 +1481,34 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, return event; } -static int trace_irq_level(void) -{ - return (hardirq_count() >> HARDIRQ_SHIFT) + - (softirq_count() >> + SOFTIRQ_SHIFT) + - !!in_nmi(); -} +#define TRACE_RECURSIVE_DEPTH 16 static int trace_recursive_lock(void) { - int level; - - level = trace_irq_level(); + current->trace_recursion++; - if (unlikely(current->trace_recursion & (1 << level))) { - /* Disable all tracing before we do anything else */ - tracing_off_permanent(); + if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) + return 0; - printk_once(KERN_WARNING "Tracing recursion: " - "HC[%lu]:SC[%lu]:NMI[%lu]\n", - hardirq_count() >> HARDIRQ_SHIFT, - softirq_count() >> SOFTIRQ_SHIFT, - in_nmi()); + /* Disable all tracing before we do anything else */ + tracing_off_permanent(); - WARN_ON_ONCE(1); - return -1; - } + printk_once(KERN_WARNING "Tracing recursion: depth[%d]:" + "HC[%lu]:SC[%lu]:NMI[%lu]\n", + current->trace_recursion, + hardirq_count() >> HARDIRQ_SHIFT, + softirq_count() >> SOFTIRQ_SHIFT, + in_nmi()); - current->trace_recursion |= 1 << level; - - return 0; + WARN_ON_ONCE(1); + return -1; } static void trace_recursive_unlock(void) { - int level; - - level = trace_irq_level(); - - WARN_ON_ONCE(!current->trace_recursion & (1 << level)); + WARN_ON_ONCE(!current->trace_recursion); - current->trace_recursion &= ~(1 << level); + current->trace_recursion--; } static DEFINE_PER_CPU(int, rb_need_resched); -- cgit v1.2.3 From cb4764a6dbffd9bb3cf759421ae82384071a933d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 18:16:44 -0400 Subject: tracing: use nowakeup version of commit for function event trace tests The startup tests for the event tracer also runs with the function tracer enabled. The "wakeup" version of the trace commit was used which can grab spinlocks. If a task was preempted by an NMI that called a function being traced, it could deadlock due to the function tracer trying to grab the same lock. Thanks to Frederic Weisbecker for pointing out where the bug was. Reported-by: Ingo Molnar Reported-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 98daf5dc74a..672b195f86c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1164,7 +1164,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) entry->ip = ip; entry->parent_ip = parent_ip; - trace_current_buffer_unlock_commit(event, flags, pc); + trace_nowake_buffer_unlock_commit(event, flags, pc); out: atomic_dec(&per_cpu(test_event_disable, cpu)); -- cgit v1.2.3 From 6e29ec5701e9d44fa02b96c1c5c45f7516182b65 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Tue, 21 Apr 2009 08:40:49 +0530 Subject: sched: Replace first_cpu() with cpumask_first() in ILB nomination code Stephen Rothwell reported this build warning: > kernel/sched.c: In function 'find_new_ilb': > kernel/sched.c:4355: warning: passing argument 1 of '__first_cpu' from incompatible pointer type > > Possibly caused by commit f711f6090a81cbd396b63de90f415d33f563af9b > ("sched: Nominate idle load balancer from a semi-idle package") from > the sched tree. Should this call to first_cpu be cpumask_first? For !(CONFIG_SCHED_MC || CONFIG_SCHED_SMT), find_new_ilb() nominates the Idle load balancer as the first cpu from the nohz.cpu_mask. This code uses the older API first_cpu(). Replace it with cpumask_first(), which is the correct API here. [ Impact: cleanup, address build warning ] Reported-by: Stephen Rothwell Signed-off-by: Gautham R Shenoy Cc: Rusty Russell LKML-Reference: <20090421031049.GA4140@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 797f6fdabad..54d67b94f1a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4356,7 +4356,7 @@ out_done: #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) { - return first_cpu(nohz.cpu_mask); + return cpumask_first(nohz.cpu_mask); } #endif -- cgit v1.2.3 From e8082f3f5a17d7a7bfc7dd1050a3f958dc034e9a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 21 Apr 2009 17:11:46 +0800 Subject: tracing/filters: don't remove old filters when failed to write subsys->filter If writing subsys->filter returns EINVAL or ENOSPC, the original filters in subsys/ and subsys/events/ will be removed. This is definitely wrong. [ Impact: fix filter setting semantics on error condition ] Signed-off-by: Li Zefan Cc: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49ED8DD2.2070700@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 672b195f86c..9ea55a7dfde 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -600,7 +600,6 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, err = filter_add_subsystem_pred(system, pred); if (err < 0) { - filter_free_subsystem_preds(system); filter_free_pred(pred); return err; } -- cgit v1.2.3 From f66578a7637b87810cbb9041c4e3a77fd2fa4706 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 21 Apr 2009 17:12:11 +0800 Subject: tracing/filters: allow user-input to be integer-like string Suppose we would like to trace all tasks named '123', but this will fail: # echo 'parent_comm == 123' > events/sched/sched_process_fork/filter bash: echo: write error: Invalid argument Don't guess the type of the filter pred in filter_parse(), but instead we check it in __filter_add_pred(). [ Impact: extend allowed filter field string values ] Signed-off-by: Li Zefan Cc: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49ED8DEB.6000700@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e0fcfd2a16d..65418288f95 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -313,6 +313,7 @@ static int __filter_add_pred(struct ftrace_event_call *call, { struct ftrace_event_field *field; filter_pred_fn_t fn; + unsigned long long val; field = find_event_field(call, pred->field_name); if (!field) @@ -322,14 +323,13 @@ static int __filter_add_pred(struct ftrace_event_call *call, pred->offset = field->offset; if (is_string_field(field->type)) { - if (!pred->str_len) - return -EINVAL; fn = filter_pred_string; pred->str_len = field->size; return filter_add_pred_fn(call, pred, fn); } else { - if (pred->str_len) + if (strict_strtoull(pred->str_val, 0, &val)) return -EINVAL; + pred->val = val; } switch (field->size) { @@ -413,12 +413,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system, return 0; } +/* + * The filter format can be + * - 0, which means remove all filter preds + * - [||/&&] ==/!= + */ int filter_parse(char **pbuf, struct filter_pred *pred) { - char *tmp, *tok, *val_str = NULL; + char *tok, *val_str = NULL; int tok_n = 0; - /* field ==/!= number, or/and field ==/!= number, number */ while ((tok = strsep(pbuf, " \n"))) { if (tok_n == 0) { if (!strcmp(tok, "0")) { @@ -478,19 +482,13 @@ int filter_parse(char **pbuf, struct filter_pred *pred) return -EINVAL; } + strcpy(pred->str_val, val_str); + pred->str_len = strlen(val_str); + pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); if (!pred->field_name) return -ENOMEM; - pred->str_len = 0; - pred->val = simple_strtoull(val_str, &tmp, 0); - if (tmp == val_str) { - strncpy(pred->str_val, val_str, MAX_FILTER_STR_VAL); - pred->str_len = strlen(val_str); - pred->str_val[pred->str_len] = '\0'; - } else if (*tmp != '\0') - return -EINVAL; - return 0; } -- cgit v1.2.3 From 3554228d4289098a8fe5cfd87512ec32a19bbe5a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 21 Apr 2009 09:41:26 -0400 Subject: ring-buffer: only warn on wrap if buffer is bigger than two pages On boot up, to save memory, ftrace allocates the minimum buffer which is two pages. Ftrace also goes through a series of tests (when configured) on boot up. These tests can fill up a page within a single interrupt. The ring buffer also has a WARN_ON when it detects that the buffer was completely filled within a single commit (other commits are allowed to be nested). Combine the small buffer on start up, with the tests that can fill more than a single page within an interrupt, this can trigger the WARN_ON. This patch makes the WARN_ON only happen when the ring buffer consists of more than two pages. [ Impact: prevent false WARN_ON in ftrace startup tests ] Reported-by: Ingo Molnar LKML-Reference: <20090421094616.GA14561@elte.hu> Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7bcfd3e6053..61dbdf21cd3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1241,7 +1241,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * about it. */ if (unlikely(next_page == commit_page)) { - WARN_ON_ONCE(1); + /* This can easily happen on small ring buffers */ + WARN_ON_ONCE(buffer->pages > 2); goto out_reset; } -- cgit v1.2.3 From 7a4f453b6d7379a7c380825949977c5a838aa012 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 22 Apr 2009 16:53:34 +0800 Subject: tracing/events: make struct trace_entry->type to be int type struct trace_entry->type is unsigned char, while trace event's id is int type, thus for a event with id >= 256, it's entry->type is cast to (id % 256), and then we can't see the trace output of this event. # insmod trace-events-sample.ko # echo foo_bar > /mnt/tracing/set_event # cat /debug/tracing/events/trace-events-sample/foo_bar/id 256 # cat /mnt/tracing/trace_pipe <...>-3548 [001] 215.091142: Unknown type 0 <...>-3548 [001] 216.089207: Unknown type 0 <...>-3548 [001] 217.087271: Unknown type 0 <...>-3548 [001] 218.085332: Unknown type 0 [ Impact: fix output for trace events with id >= 256 ] Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <49EEDB0E.5070207@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 4 ++-- kernel/trace/trace.h | 2 +- kernel/trace/trace_events.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b9a3adce922..b6183bc9eca 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -838,7 +838,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, } struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - unsigned char type, + int type, unsigned long len, unsigned long flags, int pc) { @@ -881,7 +881,7 @@ void trace_buffer_unlock_commit(struct trace_array *tr, } struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, +trace_current_buffer_lock_reserve(int type, unsigned long len, unsigned long flags, int pc) { return trace_buffer_lock_reserve(&global_trace, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 247948e81b0..7d55bcf50e4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -422,7 +422,7 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer); struct ring_buffer_event; struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - unsigned char type, + int type, unsigned long len, unsigned long flags, int pc); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9ea55a7dfde..5d6e879cf87 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -398,7 +398,7 @@ static int trace_write_header(struct trace_seq *s) "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\n", - FIELD(unsigned char, type), + FIELD(int, type), FIELD(unsigned char, flags), FIELD(unsigned char, preempt_count), FIELD(int, pid), -- cgit v1.2.3 From 9be24414aad047dcf9d8d2a9a929321536c7ebec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 26 Mar 2009 10:25:24 -0400 Subject: tracing/wakeup: move access to wakeup_cpu into spinlock The code had the following outside the lock: if (next != wakeup_task) return; pc = preempt_count(); /* The task we are waiting for is waking up */ data = wakeup_trace->data[wakeup_cpu]; On initialization, wakeup_task is NULL and wakeup_cpu -1. This code is not under a lock. If wakeup_task is set on another CPU as that task is waking up, we can see the wakeup_task before wakeup_cpu is set. If we read wakeup_cpu while it is still -1 then we will have a bad data pointer. This patch moves the reading of wakeup_cpu within the protection of the spinlock used to protect the writing of wakeup_cpu and wakeup_task. [ Impact: remove possible race causing invalid pointer dereference ] Reported-by: Maneesh Soni Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_wakeup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index b8b13c5540f..eacb2722517 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -138,9 +138,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, pc = preempt_count(); - /* The task we are waiting for is waking up */ - data = wakeup_trace->data[wakeup_cpu]; - /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); @@ -154,6 +151,9 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, if (unlikely(!tracer_enabled || next != wakeup_task)) goto out_unlock; + /* The task we are waiting for is waking up */ + data = wakeup_trace->data[wakeup_cpu]; + trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); -- cgit v1.2.3 From 89ec0dee9eba6275d47be0b878cf5f6d5c2fb6eb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 26 Mar 2009 11:03:29 -0400 Subject: tracing: increase size of number of possible events With the new event tracing registration, we must increase the number of events that can be registered. Currently the type field is only one byte, which leaves us only 256 possible events. Since we do not save the CPU number in the tracer anymore (it is determined by the per cpu ring buffer that is used) we have an extra byte to use. This patch increases the size of type from 1 byte (256 events) to 2 bytes (65,536 events). It also adds a WARN_ON_ONCE if we exceed that limit. [ Impact: allow more than 255 events ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_output.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5d6e879cf87..9887131afa0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -398,7 +398,7 @@ static int trace_write_header(struct trace_seq *s) "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\n", - FIELD(int, type), + FIELD(unsigned short, type), FIELD(unsigned char, flags), FIELD(unsigned char, preempt_count), FIELD(int, pid), diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 83a8abb9640..06997e75114 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -537,6 +537,8 @@ int register_ftrace_event(struct trace_event *event) out: mutex_unlock(&trace_event_mutex); + WARN_ON_ONCE(next_event_type > FTRACE_MAX_EVENT); + return ret; } EXPORT_SYMBOL_GPL(register_ftrace_event); -- cgit v1.2.3 From 75db37d2f4c0ad9466ead57d467277d097b4105c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 26 Mar 2009 11:43:36 -0400 Subject: tracing: add size checks for exported ftrace internal structures The events exported by TRACE_EVENT are automated and are guaranteed to be correct when used. The internal ftrace structures on the other hand are more manually exported. These require the ftrace maintainer to make sure they are up to date. This patch adds a size check to help flag when a type changes in an internal ftrace data structure, and the update needs to be reflected in the export. If a export is incorrect, then the only harm is that the user space tools will not know how to correctly read the internal structures of ftrace. [ Impact: help prevent inconsistent ftrace format print outs ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 3 +++ kernel/trace/trace_export.c | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9887131afa0..b9208158808 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -381,8 +381,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +extern char *__bad_type_size(void); + #undef FIELD #define FIELD(type, name) \ + sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ #type, "common_" #name, offsetof(typeof(field), name), \ sizeof(field.name) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 48fc02fe73a..0cb1a142c74 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -19,8 +19,12 @@ #undef TRACE_STRUCT #define TRACE_STRUCT(args...) args +extern void __bad_type_size(void); + #undef TRACE_FIELD #define TRACE_FIELD(type, item, assign) \ + if (sizeof(type) != sizeof(field.item)) \ + __bad_type_size(); \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ "offset:%u;\tsize:%u;\n", \ (unsigned int)offsetof(typeof(field), item), \ -- cgit v1.2.3 From 334d4169a6592d3fcd863bbe822a8f6985ffa9af Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 24 Apr 2009 11:27:05 +0800 Subject: ring_buffer: compressed event header RB_MAX_SMALL_DATA = 28bytes is too small for most tracers, it wastes an 'u32' to save the actually length for events which data size > 28. This fix uses compressed event header and enlarges RB_MAX_SMALL_DATA. [ Impact: saves about 0%-12.5%(depends on tracer) memory in ring_buffer ] Signed-off-by: Lai Jiangshan LKML-Reference: <49F13189.3090000@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 83 +++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 61dbdf21cd3..9692f100ec1 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -28,8 +28,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s) { int ret; - ret = trace_seq_printf(s, "\ttype : 2 bits\n"); - ret = trace_seq_printf(s, "\tlen : 3 bits\n"); + ret = trace_seq_printf(s, "# compressed entry header\n"); + ret = trace_seq_printf(s, "\ttype_len : 5 bits\n"); ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); ret = trace_seq_printf(s, "\tarray : 32 bits\n"); ret = trace_seq_printf(s, "\n"); @@ -37,8 +37,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s) RINGBUF_TYPE_PADDING); ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", RINGBUF_TYPE_TIME_EXTEND); - ret = trace_seq_printf(s, "\tdata : type == %d\n", - RINGBUF_TYPE_DATA); + ret = trace_seq_printf(s, "\tdata max type_len == %d\n", + RINGBUF_TYPE_DATA_TYPE_LEN_MAX); return ret; } @@ -204,7 +204,10 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U -#define RB_MAX_SMALL_DATA 28 +#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) + +/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ +#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX enum { RB_LEN_TIME_EXTEND = 8, @@ -213,17 +216,18 @@ enum { static inline int rb_null_event(struct ring_buffer_event *event) { - return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; + return event->type_len == RINGBUF_TYPE_PADDING + && event->time_delta == 0; } static inline int rb_discarded_event(struct ring_buffer_event *event) { - return event->type == RINGBUF_TYPE_PADDING && event->time_delta; + return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta; } static void rb_event_set_padding(struct ring_buffer_event *event) { - event->type = RINGBUF_TYPE_PADDING; + event->type_len = RINGBUF_TYPE_PADDING; event->time_delta = 0; } @@ -232,8 +236,8 @@ rb_event_data_length(struct ring_buffer_event *event) { unsigned length; - if (event->len) - length = event->len * RB_ALIGNMENT; + if (event->type_len) + length = event->type_len * RB_ALIGNMENT; else length = event->array[0]; return length + RB_EVNT_HDR_SIZE; @@ -243,12 +247,12 @@ rb_event_data_length(struct ring_buffer_event *event) static unsigned rb_event_length(struct ring_buffer_event *event) { - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) /* undefined */ return -1; - return rb_event_data_length(event); + return event->array[0] + RB_EVNT_HDR_SIZE; case RINGBUF_TYPE_TIME_EXTEND: return RB_LEN_TIME_EXTEND; @@ -272,7 +276,7 @@ rb_event_length(struct ring_buffer_event *event) unsigned ring_buffer_event_length(struct ring_buffer_event *event) { unsigned length = rb_event_length(event); - if (event->type != RINGBUF_TYPE_DATA) + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) return length; length -= RB_EVNT_HDR_SIZE; if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) @@ -285,9 +289,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); static void * rb_event_data(struct ring_buffer_event *event) { - BUG_ON(event->type != RINGBUF_TYPE_DATA); + BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ - if (event->len) + if (event->type_len) return (void *)&event->array[0]; /* Otherwise length is in array[0] and array[1] has the data */ return (void *)&event->array[1]; @@ -988,7 +992,7 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) return; /* Only count data entries */ - if (event->type != RINGBUF_TYPE_DATA) + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) continue; cpu_buffer->overrun++; cpu_buffer->entries--; @@ -1133,28 +1137,21 @@ static void rb_update_event(struct ring_buffer_event *event, unsigned type, unsigned length) { - event->type = type; + event->type_len = type; switch (type) { case RINGBUF_TYPE_PADDING: - break; - case RINGBUF_TYPE_TIME_EXTEND: - event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT); - break; - case RINGBUF_TYPE_TIME_STAMP: - event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT); break; - case RINGBUF_TYPE_DATA: + case 0: length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA) { - event->len = 0; + if (length > RB_MAX_SMALL_DATA) event->array[0] = length; - } else - event->len = DIV_ROUND_UP(length, RB_ALIGNMENT); + else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); break; default: BUG(); @@ -1562,7 +1559,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (length > BUF_PAGE_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); + event = rb_reserve_next_event(cpu_buffer, 0, length); if (!event) goto out; @@ -1634,7 +1631,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static inline void rb_event_discard(struct ring_buffer_event *event) { - event->type = RINGBUF_TYPE_PADDING; + /* array[0] holds the actual length for the discarded event */ + event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ if (!event->time_delta) event->time_delta = 1; @@ -1786,8 +1785,7 @@ int ring_buffer_write(struct ring_buffer *buffer, goto out; event_length = rb_calculate_event_length(length); - event = rb_reserve_next_event(cpu_buffer, - RINGBUF_TYPE_DATA, event_length); + event = rb_reserve_next_event(cpu_buffer, 0, event_length); if (!event) goto out; @@ -2035,7 +2033,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, { u64 delta; - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; @@ -2066,7 +2064,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, { u64 delta; - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; @@ -2181,7 +2179,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) event = rb_reader_event(cpu_buffer); - if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) + if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX + || rb_discarded_event(event)) cpu_buffer->entries--; rb_update_read_stamp(cpu_buffer, event); @@ -2262,7 +2261,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) event = rb_reader_event(cpu_buffer); - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) RB_WARN_ON(cpu_buffer, 1); @@ -2334,7 +2333,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) event = rb_iter_head_event(iter); - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) { rb_inc_iter(iter); @@ -2393,7 +2392,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) event = rb_buffer_peek(buffer, cpu, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2421,7 +2420,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) event = rb_iter_peek(iter, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2466,7 +2465,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) out: preempt_enable(); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2559,7 +2558,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2766,7 +2765,7 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) return; /* Only count data entries */ - if (event->type != RINGBUF_TYPE_DATA) + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) continue; cpu_buffer->entries--; } -- cgit v1.2.3 From 060fa5c83e67901ba47ab484cfcdb32737d630ba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Apr 2009 12:20:52 -0400 Subject: tracing/events: reuse trace event ids after overflow With modules being able to add trace events, and the max trace event counter is 16 bits (65536) we can overflow the counter easily with a simple while loop adding and removing modules that contain trace events. This patch links together the registered trace events and on overflow searches for available trace event ids. It will still fail if over 65536 events are registered, but considering that a typical kernel only has 22000 functions, 65000 events should be sufficient. Reported-by: Li Zefan Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 71 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 06997e75114..5fc51f0f75f 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -483,6 +483,36 @@ struct trace_event *ftrace_find_event(int type) return NULL; } +static LIST_HEAD(ftrace_event_list); + +static int trace_search_list(struct list_head **list) +{ + struct trace_event *e; + int last = __TRACE_LAST_TYPE; + + if (list_empty(&ftrace_event_list)) { + *list = &ftrace_event_list; + return last + 1; + } + + /* + * We used up all possible max events, + * lets see if somebody freed one. + */ + list_for_each_entry(e, &ftrace_event_list, list) { + if (e->type != last + 1) + break; + last++; + } + + /* Did we used up all 65 thousand events??? */ + if ((last + 1) > FTRACE_MAX_EVENT) + return 0; + + *list = &e->list; + return last + 1; +} + /** * register_ftrace_event - register output for an event type * @event: the event type to register @@ -505,20 +535,40 @@ int register_ftrace_event(struct trace_event *event) mutex_lock(&trace_event_mutex); - if (!event) { - ret = next_event_type++; + if (WARN_ON(!event)) goto out; - } - if (!event->type) - event->type = next_event_type++; - else if (event->type > __TRACE_LAST_TYPE) { + INIT_LIST_HEAD(&event->list); + + if (!event->type) { + struct list_head *list; + + if (next_event_type > FTRACE_MAX_EVENT) { + + event->type = trace_search_list(&list); + if (!event->type) + goto out; + + } else { + + event->type = next_event_type++; + list = &ftrace_event_list; + } + + if (WARN_ON(ftrace_find_event(event->type))) + goto out; + + list_add_tail(&event->list, list); + + } else if (event->type > __TRACE_LAST_TYPE) { printk(KERN_WARNING "Need to add type to trace.h\n"); WARN_ON(1); - } - - if (ftrace_find_event(event->type)) goto out; + } else { + /* Is this event already used */ + if (ftrace_find_event(event->type)) + goto out; + } if (event->trace == NULL) event->trace = trace_nop_print; @@ -537,8 +587,6 @@ int register_ftrace_event(struct trace_event *event) out: mutex_unlock(&trace_event_mutex); - WARN_ON_ONCE(next_event_type > FTRACE_MAX_EVENT); - return ret; } EXPORT_SYMBOL_GPL(register_ftrace_event); @@ -551,6 +599,7 @@ int unregister_ftrace_event(struct trace_event *event) { mutex_lock(&trace_event_mutex); hlist_del(&event->node); + list_del(&event->list); mutex_unlock(&trace_event_mutex); return 0; -- cgit v1.2.3 From 701970b3a83cc639c1ec8fc6f40a7871cb99426f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Apr 2009 23:11:22 -0400 Subject: tracing/events: make modules have their own file_operations structure For proper module reference counting, the file_operations that modules use must have the "owner" field set to the module. Unfortunately, the trace events use share file_operations. The same file_operations are used by all both kernel core and all modules. This patch makes the modules allocate their own file_operations and copies the functions from the core kernel. This allows those file operations to be owned by the module. Care is taken to free this code on module unload. Thanks to Greg KH for reminding me that file_operations must be owned by the module to have reference counting take place. [ Impact: fix modular tracepoints / potential crash ] Signed-off-by: Steven Rostedt Acked-by: Greg Kroah-Hartman --- kernel/trace/trace_events.c | 95 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 88 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b9208158808..be4d3a437c1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -770,7 +770,11 @@ event_subsystem_dir(const char *name, struct dentry *d_events) } static int -event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) +event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, + const struct file_operations *id, + const struct file_operations *enable, + const struct file_operations *filter, + const struct file_operations *format) { struct dentry *entry; int ret; @@ -800,11 +804,11 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) if (call->regfunc) entry = trace_create_file("enable", 0644, call->dir, call, - &ftrace_enable_fops); + enable); if (call->id) entry = trace_create_file("id", 0444, call->dir, call, - &ftrace_event_id_fops); + id); if (call->define_fields) { ret = call->define_fields(); @@ -814,7 +818,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return ret; } entry = trace_create_file("filter", 0644, call->dir, call, - &ftrace_event_filter_fops); + filter); } /* A trace may not want to export its format */ @@ -822,7 +826,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return 0; entry = trace_create_file("format", 0444, call->dir, call, - &ftrace_event_format_fops); + format); return 0; } @@ -833,8 +837,60 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) event++) #ifdef CONFIG_MODULES + +static LIST_HEAD(ftrace_module_file_list); + +/* + * Modules must own their file_operations to keep up with + * reference counting. + */ +struct ftrace_module_file_ops { + struct list_head list; + struct module *mod; + struct file_operations id; + struct file_operations enable; + struct file_operations format; + struct file_operations filter; +}; + +static struct ftrace_module_file_ops * +trace_create_file_ops(struct module *mod) +{ + struct ftrace_module_file_ops *file_ops; + + /* + * This is a bit of a PITA. To allow for correct reference + * counting, modules must "own" their file_operations. + * To do this, we allocate the file operations that will be + * used in the event directory. + */ + + file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL); + if (!file_ops) + return NULL; + + file_ops->mod = mod; + + file_ops->id = ftrace_event_id_fops; + file_ops->id.owner = mod; + + file_ops->enable = ftrace_enable_fops; + file_ops->enable.owner = mod; + + file_ops->filter = ftrace_event_filter_fops; + file_ops->filter.owner = mod; + + file_ops->format = ftrace_event_format_fops; + file_ops->format.owner = mod; + + list_add(&file_ops->list, &ftrace_module_file_list); + + return file_ops; +} + static void trace_module_add_events(struct module *mod) { + struct ftrace_module_file_ops *file_ops = NULL; struct ftrace_event_call *call, *start, *end; struct dentry *d_events; @@ -852,14 +908,27 @@ static void trace_module_add_events(struct module *mod) /* The linker may leave blanks */ if (!call->name) continue; + + /* + * This module has events, create file ops for this module + * if not already done. + */ + if (!file_ops) { + file_ops = trace_create_file_ops(mod); + if (!file_ops) + return; + } call->mod = mod; list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events); + event_create_dir(call, d_events, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); } } static void trace_module_remove_events(struct module *mod) { + struct ftrace_module_file_ops *file_ops; struct ftrace_event_call *call, *p; list_for_each_entry_safe(call, p, &ftrace_events, list) { @@ -874,6 +943,16 @@ static void trace_module_remove_events(struct module *mod) list_del(&call->list); } } + + /* Now free the file_operations */ + list_for_each_entry(file_ops, &ftrace_module_file_list, list) { + if (file_ops->mod == mod) + break; + } + if (&file_ops->list != &ftrace_module_file_list) { + list_del(&file_ops->list); + kfree(file_ops); + } } static int trace_module_notify(struct notifier_block *self, @@ -954,7 +1033,9 @@ static __init int event_trace_init(void) if (!call->name) continue; list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events); + event_create_dir(call, d_events, &ftrace_event_id_fops, + &ftrace_enable_fops, &ftrace_event_filter_fops, + &ftrace_event_format_fops); } ret = register_module_notifier(&trace_module_nb); -- cgit v1.2.3 From 9ec4fa271faf2db3b8e1419c998da1ca6b094eb6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 17:57:18 -0700 Subject: irq, cpumask: correct CPUMASKS_OFFSTACK typo and fix fallout CPUMASKS_OFFSTACK is not defined anywhere (it is CPUMASK_OFFSTACK). It is a typo and init_allocate_desc_masks() is called before it set affinity to all cpus... Split init_alloc_desc_masks() into all_desc_masks() and init_desc_masks(). Also use CPUMASK_OFFSTACK in alloc_desc_masks(). [ Impact: fix smp_affinity copying/setup when moving irq_desc between CPUs ] Signed-off-by: Yinghai Lu Acked-by: Rusty Russell Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" LKML-Reference: <49F6546E.3040406@kernel.org> Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 9 ++++++--- kernel/irq/numa_migrate.c | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d82142be8dd..882c7980010 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -115,10 +115,11 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) printk(KERN_ERR "can not alloc kstat_irqs\n"); BUG_ON(1); } - if (!init_alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, cpu, false)) { printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); BUG_ON(1); } + init_desc_masks(desc); arch_init_chip_data(desc, cpu); } @@ -169,7 +170,8 @@ int __init early_irq_init(void) desc[i].irq = i; desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - init_alloc_desc_masks(&desc[i], 0, true); + alloc_desc_masks(&desc[i], 0, true); + init_desc_masks(&desc[i]); irq_desc_ptrs[i] = desc + i; } @@ -256,7 +258,8 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq = i; - init_alloc_desc_masks(&desc[i], 0, true); + alloc_desc_masks(&desc[i], 0, true); + init_desc_masks(&desc[i]); desc[i].kstat_irqs = kstat_irqs_all[i]; } return arch_early_irq_init(); diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 44bbdcbaf8d..5760d725162 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -37,7 +37,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, struct irq_desc *desc, int cpu) { memcpy(desc, old_desc, sizeof(struct irq_desc)); - if (!init_alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, cpu, false)) { printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " "for migration.\n", irq); return false; -- cgit v1.2.3 From fcef5911c7ea89b80d5bfc727f402f37c9eefd57 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 17:58:23 -0700 Subject: x86/irq: remove leftover code from NUMA_MIGRATE_IRQ_DESC The original feature of migrating irq_desc dynamic was too fragile and was causing problems: it caused crashes on systems with lots of cards with MSI-X when user-space irq-balancer was enabled. We now have new patches that create irq_desc according to device numa node. This patch removes the leftover bits of the dynamic balancer. [ Impact: remove dead code ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49F654AF.8000808@kernel.org> Signed-off-by: Ingo Molnar --- kernel/irq/Makefile | 2 +- kernel/irq/chip.c | 12 ++---------- kernel/irq/handle.c | 9 ++------- kernel/irq/numa_migrate.c | 2 -- 4 files changed, 5 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 3394f8f5296..2f065277f8e 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o -obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o +obj-$(CONFIG_SPARSE_IRQ) += numa_migrate.o obj-$(CONFIG_PM_SLEEP) += pm.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c687ba4363f..13c68e71b72 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) spin_lock(&desc->lock); mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; @@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) desc->status &= ~IRQ_INPROGRESS; out: desc->chip->eoi(irq); - desc = irq_remap_to_desc(irq, desc); spin_unlock(&desc->lock); } @@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) !desc->action)) { desc->status |= (IRQ_PENDING | IRQ_MASKED); mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); goto out_unlock; } kstat_incr_irqs_this_cpu(irq, desc); @@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) /* Start handling the irq */ if (desc->chip->ack) desc->chip->ack(irq); - desc = irq_remap_to_desc(irq, desc); /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; @@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) if (!noirqdebug) note_interrupt(irq, desc, action_ret); - if (desc->chip->eoi) { + if (desc->chip->eoi) desc->chip->eoi(irq); - desc = irq_remap_to_desc(irq, desc); - } } void @@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, /* Uninstall? */ if (handle == handle_bad_irq) { - if (desc->chip != &no_irq_chip) { + if (desc->chip != &no_irq_chip) mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); - } desc->status |= IRQ_DISABLED; desc->depth = 1; } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 882c7980010..3e0cbc44bd7 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -458,11 +458,8 @@ unsigned int __do_IRQ(unsigned int irq) /* * No locking required for CPU-local interrupts: */ - if (desc->chip->ack) { + if (desc->chip->ack) desc->chip->ack(irq); - /* get new one */ - desc = irq_remap_to_desc(irq, desc); - } if (likely(!(desc->status & IRQ_DISABLED))) { action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) @@ -473,10 +470,8 @@ unsigned int __do_IRQ(unsigned int irq) } spin_lock(&desc->lock); - if (desc->chip->ack) { + if (desc->chip->ack) desc->chip->ack(irq); - desc = irq_remap_to_desc(irq, desc); - } /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs that are being tested diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 5760d725162..ce72bc3f4ce 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -97,9 +97,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, /* free the old one */ free_one_irq_desc(old_desc, desc); - spin_unlock(&old_desc->lock); kfree(old_desc); - spin_lock(&desc->lock); return desc; -- cgit v1.2.3 From 57b150cce8e004ddd36330490a68bfb59b7271e9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 17:59:53 -0700 Subject: irq: only update affinity if ->set_affinity() is sucessfull irq_set_affinity() and move_masked_irq() try to assign affinity before calling chip set_affinity(). Some archs are assigning it in ->set_affinity() again. We do something like: cpumask_cpy(desc->affinity, mask); desc->chip->set_affinity(mask); But in the failure path, affinity should not be touched - otherwise we'll end up with a different affinity mask despite the failure to migrate the IRQ. So try to update the afffinity only if set_affinity returns with 0. Also call irq_set_thread_affinity accordingly. v2: update after "irq, x86: Remove IRQ_DISABLED check in process context IRQ move" v3: according to Ingo, change set_affinity() in irq_chip should return int. v4: update comments by removing moving irq_desc code. [ Impact: fix /proc/irq/*/smp_affinity setting corner case bug ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49F65509.60307@kernel.org> Signed-off-by: Ingo Molnar --- kernel/irq/internals.h | 3 +++ kernel/irq/manage.c | 17 +++++++++++------ kernel/irq/migration.c | 14 +++++++++----- 3 files changed, 23 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 01ce20eab38..de5f412f6a9 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -42,6 +42,9 @@ static inline void unregister_handler_proc(unsigned int irq, extern int irq_select_affinity_usr(unsigned int irq); +extern void +irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask); + /* * Debugging printout: */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2734eca5924..aaf5c9d0577 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -80,7 +80,7 @@ int irq_can_set_affinity(unsigned int irq) return 1; } -static void +void irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) { struct irqaction *action = desc->action; @@ -109,17 +109,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) spin_lock_irqsave(&desc->lock, flags); #ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PCNTXT) - desc->chip->set_affinity(irq, cpumask); + if (desc->status & IRQ_MOVE_PCNTXT) { + if (!desc->chip->set_affinity(irq, cpumask)) { + cpumask_copy(desc->affinity, cpumask); + irq_set_thread_affinity(desc, cpumask); + } + } else { desc->status |= IRQ_MOVE_PENDING; cpumask_copy(desc->pending_mask, cpumask); } #else - cpumask_copy(desc->affinity, cpumask); - desc->chip->set_affinity(irq, cpumask); + if (!desc->chip->set_affinity(irq, cpumask)) { + cpumask_copy(desc->affinity, cpumask); + irq_set_thread_affinity(desc, cpumask); + } #endif - irq_set_thread_affinity(desc, cpumask); desc->status |= IRQ_AFFINITY_SET; spin_unlock_irqrestore(&desc->lock, flags); return 0; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index e05ad9be43b..cfe767ca154 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -1,5 +1,8 @@ #include +#include + +#include "internals.h" void move_masked_irq(int irq) { @@ -39,11 +42,12 @@ void move_masked_irq(int irq) * masking the irqs. */ if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) - < nr_cpu_ids)) { - cpumask_and(desc->affinity, - desc->pending_mask, cpu_online_mask); - desc->chip->set_affinity(irq, desc->affinity); - } + < nr_cpu_ids)) + if (!desc->chip->set_affinity(irq, desc->pending_mask)) { + cpumask_copy(desc->affinity, desc->pending_mask); + irq_set_thread_affinity(desc, desc->pending_mask); + } + cpumask_clear(desc->pending_mask); } -- cgit v1.2.3 From 85ac16d033370caf6f48d743c8dc8103700f5cc5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Apr 2009 18:00:38 -0700 Subject: x86/irq: change irq_desc_alloc() to take node instead of cpu This simplifies the node awareness of the code. All our allocators only deal with a NUMA node ID locality not with CPU ids anyway - so there's no need to maintain (and transform) a CPU id all across the IRq layer. v2: keep move_irq_desc related [ Impact: cleanup, prepare IRQ code to be NUMA-aware ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell Cc: Jeremy Fitzhardinge LKML-Reference: <49F65536.2020300@kernel.org> Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 28 +++++++++++----------------- kernel/irq/internals.h | 2 +- kernel/irq/numa_migrate.c | 36 ++++++++++++------------------------ kernel/softirq.c | 2 +- 4 files changed, 25 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3e0cbc44bd7..a6368db2618 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -81,12 +81,10 @@ static struct irq_desc irq_desc_init = { .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), }; -void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) +void init_kstat_irqs(struct irq_desc *desc, int node, int nr) { - int node; void *ptr; - node = cpu_to_node(cpu); ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); /* @@ -94,33 +92,32 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) * init_copy_kstat_irqs() could still use old one */ if (ptr) { - printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", - cpu, node); + printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node); desc->kstat_irqs = ptr; } } -static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) +static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) { memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); spin_lock_init(&desc->lock); desc->irq = irq; #ifdef CONFIG_SMP - desc->cpu = cpu; + desc->node = node; #endif lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_kstat_irqs(desc, cpu, nr_cpu_ids); + init_kstat_irqs(desc, node, nr_cpu_ids); if (!desc->kstat_irqs) { printk(KERN_ERR "can not alloc kstat_irqs\n"); BUG_ON(1); } - if (!alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, node, false)) { printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); BUG_ON(1); } init_desc_masks(desc); - arch_init_chip_data(desc, cpu); + arch_init_chip_data(desc, node); } /* @@ -189,11 +186,10 @@ struct irq_desc *irq_to_desc(unsigned int irq) return NULL; } -struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) { struct irq_desc *desc; unsigned long flags; - int node; if (irq >= nr_irqs) { WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", @@ -212,15 +208,13 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) if (desc) goto out_unlock; - node = cpu_to_node(cpu); desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n", - irq, cpu, node); + printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); if (!desc) { printk(KERN_ERR "can not alloc irq_desc\n"); BUG_ON(1); } - init_one_irq_desc(irq, desc, cpu); + init_one_irq_desc(irq, desc, node); irq_desc_ptrs[irq] = desc; @@ -270,7 +264,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) return (irq < NR_IRQS) ? irq_desc + irq : NULL; } -struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) { return irq_to_desc(irq); } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index de5f412f6a9..73468253143 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); extern struct lock_class_key irq_desc_lock_class; -extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); +extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); extern void clear_kstat_irqs(struct irq_desc *desc); extern spinlock_t sparse_irq_lock; diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index ce72bc3f4ce..2f69bee57bf 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -15,9 +15,9 @@ static void init_copy_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc, - int cpu, int nr) + int node, int nr) { - init_kstat_irqs(desc, cpu, nr); + init_kstat_irqs(desc, node, nr); if (desc->kstat_irqs != old_desc->kstat_irqs) memcpy(desc->kstat_irqs, old_desc->kstat_irqs, @@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) } static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, - struct irq_desc *desc, int cpu) + struct irq_desc *desc, int node) { memcpy(desc, old_desc, sizeof(struct irq_desc)); - if (!alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, node, false)) { printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " "for migration.\n", irq); return false; } spin_lock_init(&desc->lock); - desc->cpu = cpu; + desc->node = node; lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); + init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); init_copy_desc_masks(old_desc, desc); - arch_init_copy_chip_data(old_desc, desc, cpu); + arch_init_copy_chip_data(old_desc, desc, node); return true; } @@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) } static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, - int cpu) + int node) { struct irq_desc *desc; unsigned int irq; unsigned long flags; - int node; irq = old_desc->irq; @@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, if (desc && old_desc != desc) goto out_unlock; - node = cpu_to_node(cpu); desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); if (!desc) { printk(KERN_ERR "irq %d: can not get new irq_desc " @@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, desc = old_desc; goto out_unlock; } - if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { + if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) { /* still use old one */ kfree(desc); desc = old_desc; @@ -107,24 +105,14 @@ out_unlock: return desc; } -struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu) +struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) { - int old_cpu; - int node, old_node; - /* those all static, do move them */ if (desc->irq < NR_IRQS_LEGACY) return desc; - old_cpu = desc->cpu; - if (old_cpu != cpu) { - node = cpu_to_node(cpu); - old_node = cpu_to_node(old_cpu); - if (old_node != node) - desc = __real_move_irq_desc(desc, cpu); - else - desc->cpu = cpu; - } + if (desc->node != node) + desc = __real_move_irq_desc(desc, node); return desc; } diff --git a/kernel/softirq.c b/kernel/softirq.c index b525dd34851..f674f332a02 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -828,7 +828,7 @@ int __init __weak arch_early_irq_init(void) return 0; } -int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) +int __weak arch_init_chip_data(struct irq_desc *desc, int node) { return 0; } -- cgit v1.2.3 From cd891ae0305601bdb4d2e7e85282961c4ff256cd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Apr 2009 11:39:34 -0400 Subject: tracing: convert ftrace_dump spinlocks to raw ftrace_dump is used for printing out the contents of the ftrace ring buffer to the console on failure. Currently it uses a spinlock to synchronize the output from multiple failures on different CPUs. This spin lock currently is a normal spinlock and can cause issues with lockdep and lock tracing. This patch converts it to raw since it is for error handling only. The lock is local to the ftrace_dump and is not used by any other infrastructure. [ Impact: prevent ftrace_dump from locking up by internal tracing ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b6183bc9eca..5d704a41f83 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4114,7 +4114,8 @@ trace_printk_seq(struct trace_seq *s) static void __ftrace_dump(bool disable_tracing) { - static DEFINE_SPINLOCK(ftrace_dump_lock); + static raw_spinlock_t ftrace_dump_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; unsigned int old_userobj; @@ -4123,7 +4124,8 @@ static void __ftrace_dump(bool disable_tracing) int cnt = 0, cpu; /* only one dump */ - spin_lock_irqsave(&ftrace_dump_lock, flags); + local_irq_save(flags); + __raw_spin_lock(&ftrace_dump_lock); if (dump_ran) goto out; @@ -4195,7 +4197,8 @@ static void __ftrace_dump(bool disable_tracing) } out: - spin_unlock_irqrestore(&ftrace_dump_lock, flags); + __raw_spin_unlock(&ftrace_dump_lock); + local_irq_restore(flags); } /* By default: disable tracing after the dump */ -- cgit v1.2.3 From 5beae6efd1004b44c3e257dc96087978e4c763c1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 00:16:21 -0400 Subject: tracing: fix ref count in splice pages The pages allocated for the splice binary buffer did not initialize the ref count correctly. This caused pages not to be freed and causes a drastic memory leak. Thanks to logdev I was able to trace the tracer to find where the leak was. [ Impact: stop memory leak when using splice ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5d704a41f83..9058240c85c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3531,6 +3531,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if (!ref) break; + ref->ref = 1; ref->buffer = info->tr->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer); if (!ref->page) { -- cgit v1.2.3 From 93459c6cb9816c52200993d29dd18cea1daee335 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 00:23:13 -0400 Subject: tracing: only add splice page if entries exist The splice code allocates a page even when the ring buffer is empty. It detects the ring buffer being empty when it it fails to copy anything from the ring buffer into the page. This patch adds a check to see if there is anything in the ring buffer before allocating a page. Thanks to logdev for letting me trace the tracer to find this. [ Impact: speed up due to removing unnecessary allocation ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9058240c85c..0aeb3b93414 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3508,7 +3508,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; - int size, i; + int entries, size, i; size_t ret; if (*ppos & (PAGE_SIZE - 1)) { @@ -3523,7 +3523,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } - for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) { + entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + + for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { struct page *page; int r; @@ -3564,6 +3566,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, spd.partial[i].private = (unsigned long)ref; spd.nr_pages++; *ppos += PAGE_SIZE; + + entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); } spd.nr_pages = i; -- cgit v1.2.3 From f2957f1f196b0217644a17c1379855a118a37d72 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 00:26:30 -0400 Subject: tracing: have splice only copy full pages Splice works with pages, it is much more effecient to use an entire page than to copy bits over several pages. Using logdev to trace the internals of the splice mechanism, I was able to see that splice can be very aggressive. When tracing is occurring, and the reader caught up to the writer, and the writer is on the reader page, the reader will copy what is there into the splice page. Splice may iterate over several pages and if the writer is still writing to the page, the reader will keep copying bits to new pages to pass to userspace. This patch changes it to only pass data to userspace if the page is full (the writer has left the page). This has a small side effect that splice can not read a partial page, and must wait for the page to fill. This should not be an issue. If tracing has stopped, then a use of "read" will still read all of the page. [ Impact: better performance for ring buffer splice code ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0aeb3b93414..f5427e0fc98 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3542,7 +3542,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, } r = ring_buffer_read_page(ref->buffer, &ref->page, - len, info->cpu, 0); + len, info->cpu, 1); if (r < 0) { ring_buffer_free_read_page(ref->buffer, ref->page); -- cgit v1.2.3 From 7d7d2b803159d4edeb051b0e5efbc1a8d9ef1c67 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 27 Apr 2009 12:37:49 -0400 Subject: ring-buffer: fix printk output The warning output in trace_recursive_lock uses %d for a long when it should be %ld. [ Impact: fix compile warning ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9692f100ec1..f4cc59040eb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1491,7 +1491,7 @@ static int trace_recursive_lock(void) /* Disable all tracing before we do anything else */ tracing_off_permanent(); - printk_once(KERN_WARNING "Tracing recursion: depth[%d]:" + printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" "HC[%lu]:SC[%lu]:NMI[%lu]\n", current->trace_recursion, hardirq_count() >> HARDIRQ_SHIFT, -- cgit v1.2.3 From 30e673b230f9d556eb81ef68a7b1a08c8b3b142c Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 28 Apr 2009 03:04:47 -0500 Subject: tracing/filters: move preds into event_filter object Create a new event_filter object, and move the pred-related members out of the call and subsystem objects and into the filter object - the details of the filter implementation don't need to be exposed in the call and subsystem in any case, and it will also help make the new parser implementation a little cleaner. [ Impact: refactor trace-filter code to prepare for new features ] Signed-off-by: Tom Zanussi Acked-by: Steven Rostedt Cc: fweisbec@gmail.com Cc: Li Zefan LKML-Reference: <1240905887.6416.119.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 10 ++-- kernel/trace/trace_events.c | 3 +- kernel/trace/trace_events_filter.c | 107 +++++++++++++++++++++++-------------- 3 files changed, 74 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7d55bcf50e4..1fb7d6ccadf 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -731,12 +731,16 @@ struct ftrace_event_field { int size; }; +struct event_filter { + int n_preds; + struct filter_pred **preds; +}; + struct event_subsystem { struct list_head list; const char *name; struct dentry *entry; - int n_preds; - struct filter_pred **preds; + void *filter; }; struct filter_pred; @@ -774,7 +778,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->n_preds) && !filter_match_preds(call, rec)) { + if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { ring_buffer_discard_commit(buffer, event); return 1; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index be4d3a437c1..1cd1f37373d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -757,8 +757,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) list_add(&system->list, &event_subsystems); - system->preds = NULL; - system->n_preds = 0; + system->filter = NULL; entry = debugfs_create_file("filter", 0644, system->entry, system, &ftrace_subsystem_filter_fops); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 65418288f95..1e861eca3d0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -93,11 +93,12 @@ static int filter_pred_none(struct filter_pred *pred, void *event) /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct ftrace_event_call *call, void *rec) { + struct event_filter *filter = call->filter; int i, matched, and_failed = 0; struct filter_pred *pred; - for (i = 0; i < call->n_preds; i++) { - pred = call->preds[i]; + for (i = 0; i < filter->n_preds; i++) { + pred = filter->preds[i]; if (and_failed && !pred->or) continue; matched = pred->fn(pred, rec); @@ -115,20 +116,20 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec) } EXPORT_SYMBOL_GPL(filter_match_preds); -static void __filter_print_preds(struct filter_pred **preds, int n_preds, +static void __filter_print_preds(struct event_filter *filter, struct trace_seq *s) { - char *field_name; struct filter_pred *pred; + char *field_name; int i; - if (!n_preds) { + if (!filter || !filter->n_preds) { trace_seq_printf(s, "none\n"); return; } - for (i = 0; i < n_preds; i++) { - pred = preds[i]; + for (i = 0; i < filter->n_preds; i++) { + pred = filter->preds[i]; field_name = pred->field_name; if (i) trace_seq_printf(s, pred->or ? "|| " : "&& "); @@ -144,7 +145,7 @@ static void __filter_print_preds(struct filter_pred **preds, int n_preds, void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s) { mutex_lock(&filter_mutex); - __filter_print_preds(call->preds, call->n_preds, s); + __filter_print_preds(call->filter, s); mutex_unlock(&filter_mutex); } @@ -152,7 +153,7 @@ void filter_print_subsystem_preds(struct event_subsystem *system, struct trace_seq *s) { mutex_lock(&filter_mutex); - __filter_print_preds(system->preds, system->n_preds, s); + __filter_print_preds(system->filter, s); mutex_unlock(&filter_mutex); } @@ -200,12 +201,14 @@ static int filter_set_pred(struct filter_pred *dest, static void __filter_disable_preds(struct ftrace_event_call *call) { + struct event_filter *filter = call->filter; int i; - call->n_preds = 0; + call->filter_active = 0; + filter->n_preds = 0; for (i = 0; i < MAX_FILTER_PRED; i++) - call->preds[i]->fn = filter_pred_none; + filter->preds[i]->fn = filter_pred_none; } void filter_disable_preds(struct ftrace_event_call *call) @@ -217,32 +220,39 @@ void filter_disable_preds(struct ftrace_event_call *call) int init_preds(struct ftrace_event_call *call) { + struct event_filter *filter; struct filter_pred *pred; int i; - call->n_preds = 0; - - call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!call->preds) + filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!call->filter) return -ENOMEM; + call->filter_active = 0; + filter->n_preds = 0; + + filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); + if (!filter->preds) + goto oom; + for (i = 0; i < MAX_FILTER_PRED; i++) { pred = kzalloc(sizeof(*pred), GFP_KERNEL); if (!pred) goto oom; pred->fn = filter_pred_none; - call->preds[i] = pred; + filter->preds[i] = pred; } return 0; oom: for (i = 0; i < MAX_FILTER_PRED; i++) { - if (call->preds[i]) - filter_free_pred(call->preds[i]); + if (filter->preds[i]) + filter_free_pred(filter->preds[i]); } - kfree(call->preds); - call->preds = NULL; + kfree(filter->preds); + kfree(call->filter); + call->filter = NULL; return -ENOMEM; } @@ -250,15 +260,16 @@ EXPORT_SYMBOL_GPL(init_preds); static void __filter_free_subsystem_preds(struct event_subsystem *system) { + struct event_filter *filter = system->filter; struct ftrace_event_call *call; int i; - if (system->n_preds) { - for (i = 0; i < system->n_preds; i++) - filter_free_pred(system->preds[i]); - kfree(system->preds); - system->preds = NULL; - system->n_preds = 0; + if (filter && filter->n_preds) { + for (i = 0; i < filter->n_preds; i++) + filter_free_pred(filter->preds[i]); + kfree(filter->preds); + kfree(filter); + system->filter = NULL; } list_for_each_entry(call, &ftrace_events, list) { @@ -281,21 +292,23 @@ static int filter_add_pred_fn(struct ftrace_event_call *call, struct filter_pred *pred, filter_pred_fn_t fn) { + struct event_filter *filter = call->filter; int idx, err; - if (call->n_preds && !pred->compound) + if (filter->n_preds && !pred->compound) __filter_disable_preds(call); - if (call->n_preds == MAX_FILTER_PRED) + if (filter->n_preds == MAX_FILTER_PRED) return -ENOSPC; - idx = call->n_preds; - filter_clear_pred(call->preds[idx]); - err = filter_set_pred(call->preds[idx], pred, fn); + idx = filter->n_preds; + filter_clear_pred(filter->preds[idx]); + err = filter_set_pred(filter->preds[idx], pred, fn); if (err) return err; - call->n_preds++; + filter->n_preds++; + call->filter_active = 1; return 0; } @@ -366,29 +379,41 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred) { + struct event_filter *filter = system->filter; struct ftrace_event_call *call; mutex_lock(&filter_mutex); - if (system->n_preds && !pred->compound) + if (filter && filter->n_preds && !pred->compound) { __filter_free_subsystem_preds(system); + filter = NULL; + } - if (!system->n_preds) { - system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), + if (!filter) { + system->filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!system->filter) { + mutex_unlock(&filter_mutex); + return -ENOMEM; + } + filter = system->filter; + filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!system->preds) { + + if (!filter->preds) { + kfree(system->filter); + system->filter = NULL; mutex_unlock(&filter_mutex); return -ENOMEM; } } - if (system->n_preds == MAX_FILTER_PRED) { + if (filter->n_preds == MAX_FILTER_PRED) { mutex_unlock(&filter_mutex); return -ENOSPC; } - system->preds[system->n_preds] = pred; - system->n_preds++; + filter->preds[filter->n_preds] = pred; + filter->n_preds++; list_for_each_entry(call, &ftrace_events, list) { int err; @@ -401,8 +426,8 @@ int filter_add_subsystem_pred(struct event_subsystem *system, err = __filter_add_pred(call, pred); if (err == -ENOMEM) { - system->preds[system->n_preds] = NULL; - system->n_preds--; + filter->preds[filter->n_preds] = NULL; + filter->n_preds--; mutex_unlock(&filter_mutex); return err; } -- cgit v1.2.3 From a118e4d1402f1349fe3d953493e4168a300a752d Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 28 Apr 2009 03:04:53 -0500 Subject: tracing/filters: distinguish between signed and unsigned fields The new filter comparison ops need to be able to distinguish between signed and unsigned field types, so add an is_signed flag/param to the event field struct/trace_define_fields(). Also define a simple macro, is_signed_type() to determine the signedness at compile time, used in the trace macros. If the is_signed_type() macro won't work with a specific type, a new slightly modified version of TRACE_FIELD() called TRACE_FIELD_SIGN(), allows the signedness to be set explicitly. [ Impact: extend trace-filter code for new feature ] Signed-off-by: Tom Zanussi Acked-by: Steven Rostedt Cc: fweisbec@gmail.com Cc: Li Zefan LKML-Reference: <1240905893.6416.120.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 1 + kernel/trace/trace_event_types.h | 4 ++-- kernel/trace/trace_events.c | 3 ++- kernel/trace/trace_export.c | 29 ++++++++++++++++++++++------- 4 files changed, 27 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1fb7d6ccadf..866d0108fd2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -729,6 +729,7 @@ struct ftrace_event_field { char *type; int offset; int size; + int is_signed; }; struct event_filter { diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index cfcecc4fd86..5e32e375134 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -141,8 +141,8 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, TRACE_STRUCT( - TRACE_FIELD(ktime_t, state_data.stamp, stamp) - TRACE_FIELD(ktime_t, state_data.end, end) + TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1) + TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1) TRACE_FIELD(int, state_data.type, type) TRACE_FIELD(int, state_data.state, state) ), diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1cd1f37373d..bbbea747937 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -26,7 +26,7 @@ static DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size) + char *name, int offset, int size, int is_signed) { struct ftrace_event_field *field; @@ -44,6 +44,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type, field->offset = offset; field->size = size; + field->is_signed = is_signed; list_add(&field->link, &call->fields); return 0; diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 0cb1a142c74..d06cf898dc8 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -50,6 +50,9 @@ extern void __bad_type_size(void); if (!ret) \ return 0; +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + TRACE_FIELD(type, item, assign) #undef TP_RAW_FMT #define TP_RAW_FMT(args...) args @@ -98,6 +101,10 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_FIELD(type, item, assign)\ entry->item = assign; +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + TRACE_FIELD(type, item, assign) + #undef TP_CMD #define TP_CMD(cmd...) cmd @@ -149,7 +156,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD(type, item, assign) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), is_signed_type(type)); \ if (ret) \ return ret; @@ -157,7 +164,15 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), 0); \ + if (ret) \ + return ret; + +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed); \ if (ret) \ return ret; @@ -173,11 +188,11 @@ ftrace_define_fields_##call(void) \ struct args field; \ int ret; \ \ - __common_field(unsigned char, type); \ - __common_field(unsigned char, flags); \ - __common_field(unsigned char, preempt_count); \ - __common_field(int, pid); \ - __common_field(int, tgid); \ + __common_field(unsigned char, type, 0); \ + __common_field(unsigned char, flags, 0); \ + __common_field(unsigned char, preempt_count, 0); \ + __common_field(int, pid, 1); \ + __common_field(int, tgid, 1); \ \ tstruct; \ \ -- cgit v1.2.3 From 8b3725621074040d380664964ffbc40610aef8c6 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 28 Apr 2009 03:04:59 -0500 Subject: tracing/filters: a better event parser Replace the current event parser hack with a better one. Filters are no longer specified predicate by predicate, but all at once and can use parens and any of the following operators: numeric fields: ==, !=, <, <=, >, >= string fields: ==, != predicates can be combined with the logical operators: &&, || examples: "common_preempt_count > 4" > filter "((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter If there was an error, the erroneous string along with an error message can be seen by looking at the filter e.g.: ((sig >= 10 && sig < 15) || dsig == 17) && comm != bash ^ parse_error: Field not found Currently the caret for an error always appears at the beginning of the filter; a real position should be used, but the error message should be useful even without it. To clear a filter, '0' can be written to the filter file. Filters can also be set or cleared for a complete subsystem by writing the same filter as would be written to an individual event to the filter file at the root of the subsytem. Note however, that if any event in the subsystem lacks a field specified in the filter being set, the set will fail and all filters in the subsytem are automatically cleared. This change from the previous version was made because using only the fields that happen to exist for a given event would most likely result in a meaningless filter. Because the logical operators are now implemented as predicates, the maximum number of predicates in a filter was increased from 8 to 16. [ Impact: add new, extended trace-filter implementation ] Signed-off-by: Tom Zanussi Acked-by: Steven Rostedt Cc: fweisbec@gmail.com Cc: Li Zefan LKML-Reference: <1240905899.6416.121.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 66 ++- kernel/trace/trace_events.c | 86 ++- kernel/trace/trace_events_filter.c | 1020 ++++++++++++++++++++++++++++-------- 3 files changed, 883 insertions(+), 289 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 866d0108fd2..7736fe8c1b7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -735,6 +735,7 @@ struct ftrace_event_field { struct event_filter { int n_preds; struct filter_pred **preds; + char *filter_string; }; struct event_subsystem { @@ -746,7 +747,8 @@ struct event_subsystem { struct filter_pred; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, + int val1, int val2); struct filter_pred { filter_pred_fn_t fn; @@ -756,23 +758,18 @@ struct filter_pred { char *field_name; int offset; int not; - int or; - int compound; - int clear; + int op; + int pop_n; }; -extern void filter_free_pred(struct filter_pred *pred); -extern void filter_print_preds(struct ftrace_event_call *call, +extern void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s); -extern int filter_parse(char **pbuf, struct filter_pred *pred); -extern int filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred); -extern void filter_disable_preds(struct ftrace_event_call *call); -extern void filter_free_subsystem_preds(struct event_subsystem *system); -extern void filter_print_subsystem_preds(struct event_subsystem *system, +extern int apply_event_filter(struct ftrace_event_call *call, + char *filter_string); +extern int apply_subsystem_event_filter(struct event_subsystem *system, + char *filter_string); +extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); -extern int filter_add_subsystem_pred(struct event_subsystem *system, - struct filter_pred *pred); static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, @@ -787,6 +784,47 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } +#define DEFINE_COMPARISON_PRED(type) \ +static int filter_pred_##type(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = 0; \ + \ + switch (pred->op) { \ + case OP_LT: \ + match = (*addr < val); \ + break; \ + case OP_LE: \ + match = (*addr <= val); \ + break; \ + case OP_GT: \ + match = (*addr > val); \ + break; \ + case OP_GE: \ + match = (*addr >= val); \ + break; \ + default: \ + break; \ + } \ + \ + return match; \ +} + +#define DEFINE_EQUALITY_PRED(size) \ +static int filter_pred_##size(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + u##size val = (u##size)pred->val; \ + int match; \ + \ + match = (val == *addr) ^ pred->not; \ + \ + return match; \ +} + extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index bbbea747937..f789ca540fe 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -492,7 +492,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(call, s); + print_event_filter(call, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -505,40 +505,26 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct ftrace_event_call *call = filp->private_data; - char buf[64], *pbuf = buf; - struct filter_pred *pred; + char *buf; int err; - if (cnt >= sizeof(buf)) + if (cnt >= PAGE_SIZE) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - buf[cnt] = '\0'; - - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) return -ENOMEM; - err = filter_parse(&pbuf, pred); - if (err < 0) { - filter_free_pred(pred); - return err; - } - - if (pred->clear) { - filter_disable_preds(call); - filter_free_pred(pred); - return cnt; + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long) buf); + return -EFAULT; } + buf[cnt] = '\0'; - err = filter_add_pred(call, pred); - if (err < 0) { - filter_free_pred(pred); + err = apply_event_filter(call, buf); + free_page((unsigned long) buf); + if (err < 0) return err; - } - - filter_free_pred(pred); *ppos += cnt; @@ -562,7 +548,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_subsystem_preds(system, s); + print_subsystem_event_filter(system, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -575,38 +561,26 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct event_subsystem *system = filp->private_data; - char buf[64], *pbuf = buf; - struct filter_pred *pred; + char *buf; int err; - if (cnt >= sizeof(buf)) + if (cnt >= PAGE_SIZE) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - buf[cnt] = '\0'; - - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) return -ENOMEM; - err = filter_parse(&pbuf, pred); - if (err < 0) { - filter_free_pred(pred); - return err; - } - - if (pred->clear) { - filter_free_subsystem_preds(system); - filter_free_pred(pred); - return cnt; + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long) buf); + return -EFAULT; } + buf[cnt] = '\0'; - err = filter_add_subsystem_pred(system, pred); - if (err < 0) { - filter_free_pred(pred); + err = apply_subsystem_event_filter(system, buf); + free_page((unsigned long) buf); + if (err < 0) return err; - } *ppos += cnt; @@ -760,11 +734,21 @@ event_subsystem_dir(const char *name, struct dentry *d_events) system->filter = NULL; + system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); + if (!system->filter) { + pr_warning("Could not allocate filter for subsystem " + "'%s'\n", name); + return system->entry; + } + entry = debugfs_create_file("filter", 0644, system->entry, system, &ftrace_subsystem_filter_fops); - if (!entry) + if (!entry) { + kfree(system->filter); + system->filter = NULL; pr_warning("Could not create debugfs " "'%s/filter' entry\n", name); + } return system->entry; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1e861eca3d0..f49486687ee 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -29,51 +29,130 @@ static DEFINE_MUTEX(filter_mutex); -static int filter_pred_64(struct filter_pred *pred, void *event) +enum filter_op_ids { - u64 *addr = (u64 *)(event + pred->offset); - u64 val = (u64)pred->val; - int match; - - match = (val == *addr) ^ pred->not; - - return match; -} - -static int filter_pred_32(struct filter_pred *pred, void *event) -{ - u32 *addr = (u32 *)(event + pred->offset); - u32 val = (u32)pred->val; - int match; - - match = (val == *addr) ^ pred->not; - - return match; -} - -static int filter_pred_16(struct filter_pred *pred, void *event) + OP_OR, + OP_AND, + OP_NE, + OP_EQ, + OP_LT, + OP_LE, + OP_GT, + OP_GE, + OP_NONE, + OP_OPEN_PAREN, +}; + +struct filter_op { + int id; + char *string; + int precedence; +}; + +static struct filter_op filter_ops[] = { + { OP_OR, "||", 1 }, + { OP_AND, "&&", 2 }, + { OP_NE, "!=", 4 }, + { OP_EQ, "==", 4 }, + { OP_LT, "<", 5 }, + { OP_LE, "<=", 5 }, + { OP_GT, ">", 5 }, + { OP_GE, ">=", 5 }, + { OP_NONE, "OP_NONE", 0 }, + { OP_OPEN_PAREN, "(", 0 }, +}; + +enum { + FILT_ERR_NONE, + FILT_ERR_INVALID_OP, + FILT_ERR_UNBALANCED_PAREN, + FILT_ERR_TOO_MANY_OPERANDS, + FILT_ERR_OPERAND_TOO_LONG, + FILT_ERR_FIELD_NOT_FOUND, + FILT_ERR_ILLEGAL_FIELD_OP, + FILT_ERR_ILLEGAL_INTVAL, + FILT_ERR_BAD_SUBSYS_FILTER, + FILT_ERR_TOO_MANY_PREDS, + FILT_ERR_MISSING_FIELD, + FILT_ERR_INVALID_FILTER, +}; + +static char *err_text[] = { + "No error", + "Invalid operator", + "Unbalanced parens", + "Too many operands", + "Operand too long", + "Field not found", + "Illegal operation for field type", + "Illegal integer value", + "Couldn't find or set field in one of a subsystem's events", + "Too many terms in predicate expression", + "Missing field name and/or value", + "Meaningless filter expression", +}; + +struct opstack_op { + int op; + struct list_head list; +}; + +struct postfix_elt { + int op; + char *operand; + struct list_head list; +}; + +struct filter_parse_state { + struct filter_op *ops; + struct list_head opstack; + struct list_head postfix; + int lasterr; + int lasterr_pos; + + struct { + char *string; + unsigned int cnt; + unsigned int tail; + } infix; + + struct { + char string[MAX_FILTER_STR_VAL]; + int pos; + unsigned int tail; + } operand; +}; + +DEFINE_COMPARISON_PRED(s64); +DEFINE_COMPARISON_PRED(u64); +DEFINE_COMPARISON_PRED(s32); +DEFINE_COMPARISON_PRED(u32); +DEFINE_COMPARISON_PRED(s16); +DEFINE_COMPARISON_PRED(u16); +DEFINE_COMPARISON_PRED(s8); +DEFINE_COMPARISON_PRED(u8); + +DEFINE_EQUALITY_PRED(64); +DEFINE_EQUALITY_PRED(32); +DEFINE_EQUALITY_PRED(16); +DEFINE_EQUALITY_PRED(8); + +static int filter_pred_and(struct filter_pred *pred __attribute((unused)), + void *event __attribute((unused)), + int val1, int val2) { - u16 *addr = (u16 *)(event + pred->offset); - u16 val = (u16)pred->val; - int match; - - match = (val == *addr) ^ pred->not; - - return match; + return val1 && val2; } -static int filter_pred_8(struct filter_pred *pred, void *event) +static int filter_pred_or(struct filter_pred *pred __attribute((unused)), + void *event __attribute((unused)), + int val1, int val2) { - u8 *addr = (u8 *)(event + pred->offset); - u8 val = (u8)pred->val; - int match; - - match = (val == *addr) ^ pred->not; - - return match; + return val1 || val2; } -static int filter_pred_string(struct filter_pred *pred, void *event) +static int filter_pred_string(struct filter_pred *pred, void *event, + int val1, int val2) { char *addr = (char *)(event + pred->offset); int cmp, match; @@ -85,7 +164,8 @@ static int filter_pred_string(struct filter_pred *pred, void *event) return match; } -static int filter_pred_none(struct filter_pred *pred, void *event) +static int filter_pred_none(struct filter_pred *pred, void *event, + int val1, int val2) { return 0; } @@ -94,66 +174,119 @@ static int filter_pred_none(struct filter_pred *pred, void *event) int filter_match_preds(struct ftrace_event_call *call, void *rec) { struct event_filter *filter = call->filter; - int i, matched, and_failed = 0; + int match, top = 0, val1 = 0, val2 = 0; + int stack[MAX_FILTER_PRED]; struct filter_pred *pred; + int i; for (i = 0; i < filter->n_preds; i++) { pred = filter->preds[i]; - if (and_failed && !pred->or) + if (!pred->pop_n) { + match = pred->fn(pred, rec, val1, val2); + stack[top++] = match; continue; - matched = pred->fn(pred, rec); - if (!matched && !pred->or) { - and_failed = 1; - continue; - } else if (matched && pred->or) - return 1; + } + if (pred->pop_n > top) { + WARN_ON_ONCE(1); + return 0; + } + val1 = stack[--top]; + val2 = stack[--top]; + match = pred->fn(pred, rec, val1, val2); + stack[top++] = match; } - if (and_failed) - return 0; - - return 1; + return stack[--top]; } EXPORT_SYMBOL_GPL(filter_match_preds); -static void __filter_print_preds(struct event_filter *filter, - struct trace_seq *s) +static void parse_error(struct filter_parse_state *ps, int err, int pos) { - struct filter_pred *pred; - char *field_name; - int i; + ps->lasterr = err; + ps->lasterr_pos = pos; +} - if (!filter || !filter->n_preds) { - trace_seq_printf(s, "none\n"); +static void remove_filter_string(struct event_filter *filter) +{ + kfree(filter->filter_string); + filter->filter_string = NULL; +} + +static int replace_filter_string(struct event_filter *filter, + char *filter_string) +{ + kfree(filter->filter_string); + filter->filter_string = kstrdup(filter_string, GFP_KERNEL); + if (!filter->filter_string) + return -ENOMEM; + + return 0; +} + +static int append_filter_string(struct event_filter *filter, + char *string) +{ + int newlen; + char *new_filter_string; + + BUG_ON(!filter->filter_string); + newlen = strlen(filter->filter_string) + strlen(string) + 1; + new_filter_string = kmalloc(newlen, GFP_KERNEL); + if (!new_filter_string) + return -ENOMEM; + + strcpy(new_filter_string, filter->filter_string); + strcat(new_filter_string, string); + kfree(filter->filter_string); + filter->filter_string = new_filter_string; + + return 0; +} + +static void append_filter_err(struct filter_parse_state *ps, + struct event_filter *filter) +{ + int pos = ps->lasterr_pos; + char *buf, *pbuf; + + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) return; - } - for (i = 0; i < filter->n_preds; i++) { - pred = filter->preds[i]; - field_name = pred->field_name; - if (i) - trace_seq_printf(s, pred->or ? "|| " : "&& "); - trace_seq_printf(s, "%s ", field_name); - trace_seq_printf(s, pred->not ? "!= " : "== "); - if (pred->str_len) - trace_seq_printf(s, "%s\n", pred->str_val); - else - trace_seq_printf(s, "%llu\n", pred->val); - } + append_filter_string(filter, "\n"); + memset(buf, ' ', PAGE_SIZE); + if (pos > PAGE_SIZE - 128) + pos = 0; + buf[pos] = '^'; + pbuf = &buf[pos] + 1; + + sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]); + append_filter_string(filter, buf); + free_page((unsigned long) buf); } -void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s) +void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) { + struct event_filter *filter = call->filter; + mutex_lock(&filter_mutex); - __filter_print_preds(call->filter, s); + if (filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else + trace_seq_printf(s, "none\n"); mutex_unlock(&filter_mutex); } -void filter_print_subsystem_preds(struct event_subsystem *system, +void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s) { + struct event_filter *filter = system->filter; + mutex_lock(&filter_mutex); - __filter_print_preds(system->filter, s); + if (filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else + trace_seq_printf(s, "none\n"); mutex_unlock(&filter_mutex); } @@ -170,7 +303,7 @@ find_event_field(struct ftrace_event_call *call, char *name) return NULL; } -void filter_free_pred(struct filter_pred *pred) +static void filter_free_pred(struct filter_pred *pred) { if (!pred) return; @@ -191,15 +324,17 @@ static int filter_set_pred(struct filter_pred *dest, filter_pred_fn_t fn) { *dest = *src; - dest->field_name = kstrdup(src->field_name, GFP_KERNEL); - if (!dest->field_name) - return -ENOMEM; + if (src->field_name) { + dest->field_name = kstrdup(src->field_name, GFP_KERNEL); + if (!dest->field_name) + return -ENOMEM; + } dest->fn = fn; return 0; } -static void __filter_disable_preds(struct ftrace_event_call *call) +static void filter_disable_preds(struct ftrace_event_call *call) { struct event_filter *filter = call->filter; int i; @@ -211,13 +346,6 @@ static void __filter_disable_preds(struct ftrace_event_call *call) filter->preds[i]->fn = filter_pred_none; } -void filter_disable_preds(struct ftrace_event_call *call) -{ - mutex_lock(&filter_mutex); - __filter_disable_preds(call); - mutex_unlock(&filter_mutex); -} - int init_preds(struct ftrace_event_call *call) { struct event_filter *filter; @@ -258,48 +386,43 @@ oom: } EXPORT_SYMBOL_GPL(init_preds); -static void __filter_free_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_preds(struct event_subsystem *system) { struct event_filter *filter = system->filter; struct ftrace_event_call *call; int i; - if (filter && filter->n_preds) { + if (filter->n_preds) { for (i = 0; i < filter->n_preds; i++) filter_free_pred(filter->preds[i]); kfree(filter->preds); - kfree(filter); - system->filter = NULL; + filter->preds = NULL; + filter->n_preds = 0; } list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) continue; - if (!strcmp(call->system, system->name)) - __filter_disable_preds(call); + if (!strcmp(call->system, system->name)) { + filter_disable_preds(call); + remove_filter_string(call->filter); + } } } -void filter_free_subsystem_preds(struct event_subsystem *system) -{ - mutex_lock(&filter_mutex); - __filter_free_subsystem_preds(system); - mutex_unlock(&filter_mutex); -} - -static int filter_add_pred_fn(struct ftrace_event_call *call, +static int filter_add_pred_fn(struct filter_parse_state *ps, + struct ftrace_event_call *call, struct filter_pred *pred, filter_pred_fn_t fn) { struct event_filter *filter = call->filter; int idx, err; - if (filter->n_preds && !pred->compound) - __filter_disable_preds(call); - - if (filter->n_preds == MAX_FILTER_PRED) + if (filter->n_preds == MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; + } idx = filter->n_preds; filter_clear_pred(filter->preds[idx]); @@ -321,94 +444,132 @@ static int is_string_field(const char *type) return 0; } -static int __filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred) +static int is_legal_op(struct ftrace_event_field *field, int op) +{ + if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE)) + return 0; + + return 1; +} + +static filter_pred_fn_t select_comparison_fn(int op, int field_size, + int field_is_signed) +{ + filter_pred_fn_t fn = NULL; + + switch (field_size) { + case 8: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_64; + else if (field_is_signed) + fn = filter_pred_s64; + else + fn = filter_pred_u64; + break; + case 4: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_32; + else if (field_is_signed) + fn = filter_pred_s32; + else + fn = filter_pred_u32; + break; + case 2: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_16; + else if (field_is_signed) + fn = filter_pred_s16; + else + fn = filter_pred_u16; + break; + case 1: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_8; + else if (field_is_signed) + fn = filter_pred_s8; + else + fn = filter_pred_u8; + break; + } + + return fn; +} + +static int filter_add_pred(struct filter_parse_state *ps, + struct ftrace_event_call *call, + struct filter_pred *pred) { struct ftrace_event_field *field; filter_pred_fn_t fn; unsigned long long val; + pred->fn = filter_pred_none; + + if (pred->op == OP_AND) { + pred->pop_n = 2; + return filter_add_pred_fn(ps, call, pred, filter_pred_and); + } else if (pred->op == OP_OR) { + pred->pop_n = 2; + return filter_add_pred_fn(ps, call, pred, filter_pred_or); + } + field = find_event_field(call, pred->field_name); - if (!field) + if (!field) { + parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); return -EINVAL; + } - pred->fn = filter_pred_none; pred->offset = field->offset; + if (!is_legal_op(field, pred->op)) { + parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0); + return -EINVAL; + } + if (is_string_field(field->type)) { fn = filter_pred_string; pred->str_len = field->size; - return filter_add_pred_fn(call, pred, fn); + if (pred->op == OP_NE) + pred->not = 1; + return filter_add_pred_fn(ps, call, pred, fn); } else { - if (strict_strtoull(pred->str_val, 0, &val)) + if (strict_strtoull(pred->str_val, 0, &val)) { + parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); return -EINVAL; + } pred->val = val; } - switch (field->size) { - case 8: - fn = filter_pred_64; - break; - case 4: - fn = filter_pred_32; - break; - case 2: - fn = filter_pred_16; - break; - case 1: - fn = filter_pred_8; - break; - default: + fn = select_comparison_fn(pred->op, field->size, field->is_signed); + if (!fn) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); return -EINVAL; } - return filter_add_pred_fn(call, pred, fn); -} - -int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) -{ - int err; - - mutex_lock(&filter_mutex); - err = __filter_add_pred(call, pred); - mutex_unlock(&filter_mutex); + if (pred->op == OP_NE) + pred->not = 1; - return err; + return filter_add_pred_fn(ps, call, pred, fn); } -int filter_add_subsystem_pred(struct event_subsystem *system, - struct filter_pred *pred) +static int filter_add_subsystem_pred(struct filter_parse_state *ps, + struct event_subsystem *system, + struct filter_pred *pred, + char *filter_string) { struct event_filter *filter = system->filter; struct ftrace_event_call *call; - mutex_lock(&filter_mutex); - - if (filter && filter->n_preds && !pred->compound) { - __filter_free_subsystem_preds(system); - filter = NULL; - } - - if (!filter) { - system->filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!system->filter) { - mutex_unlock(&filter_mutex); - return -ENOMEM; - } - filter = system->filter; + if (!filter->preds) { filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!filter->preds) { - kfree(system->filter); - system->filter = NULL; - mutex_unlock(&filter_mutex); + if (!filter->preds) return -ENOMEM; - } } if (filter->n_preds == MAX_FILTER_PRED) { - mutex_unlock(&filter_mutex); + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } @@ -424,97 +585,508 @@ int filter_add_subsystem_pred(struct event_subsystem *system, if (strcmp(call->system, system->name)) continue; - err = __filter_add_pred(call, pred); - if (err == -ENOMEM) { - filter->preds[filter->n_preds] = NULL; - filter->n_preds--; - mutex_unlock(&filter_mutex); + err = filter_add_pred(ps, call, pred); + if (err) { + filter_free_subsystem_preds(system); + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); return err; } + replace_filter_string(call->filter, filter_string); } - mutex_unlock(&filter_mutex); + return 0; +} + +static void parse_init(struct filter_parse_state *ps, + struct filter_op *ops, + char *infix_string) +{ + memset(ps, '\0', sizeof(*ps)); + + ps->infix.string = infix_string; + ps->infix.cnt = strlen(infix_string); + ps->ops = ops; + + INIT_LIST_HEAD(&ps->opstack); + INIT_LIST_HEAD(&ps->postfix); +} + +static char infix_next(struct filter_parse_state *ps) +{ + ps->infix.cnt--; + + return ps->infix.string[ps->infix.tail++]; +} + +static char infix_peek(struct filter_parse_state *ps) +{ + if (ps->infix.tail == strlen(ps->infix.string)) + return 0; + + return ps->infix.string[ps->infix.tail]; +} + +static void infix_advance(struct filter_parse_state *ps) +{ + ps->infix.cnt--; + ps->infix.tail++; +} + +static inline int is_precedence_lower(struct filter_parse_state *ps, + int a, int b) +{ + return ps->ops[a].precedence < ps->ops[b].precedence; +} + +static inline int is_op_char(struct filter_parse_state *ps, char c) +{ + int i; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (ps->ops[i].string[0] == c) + return 1; + } return 0; } -/* - * The filter format can be - * - 0, which means remove all filter preds - * - [||/&&] ==/!= - */ -int filter_parse(char **pbuf, struct filter_pred *pred) -{ - char *tok, *val_str = NULL; - int tok_n = 0; - - while ((tok = strsep(pbuf, " \n"))) { - if (tok_n == 0) { - if (!strcmp(tok, "0")) { - pred->clear = 1; - return 0; - } else if (!strcmp(tok, "&&")) { - pred->or = 0; - pred->compound = 1; - } else if (!strcmp(tok, "||")) { - pred->or = 1; - pred->compound = 1; - } else - pred->field_name = tok; - tok_n = 1; - continue; +static int infix_get_op(struct filter_parse_state *ps, char firstc) +{ + char nextc = infix_peek(ps); + char opstr[3]; + int i; + + opstr[0] = firstc; + opstr[1] = nextc; + opstr[2] = '\0'; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (!strcmp(opstr, ps->ops[i].string)) { + infix_advance(ps); + return ps->ops[i].id; } - if (tok_n == 1) { - if (!pred->field_name) - pred->field_name = tok; - else if (!strcmp(tok, "!=")) - pred->not = 1; - else if (!strcmp(tok, "==")) - pred->not = 0; - else { - pred->field_name = NULL; + } + + opstr[1] = '\0'; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (!strcmp(opstr, ps->ops[i].string)) + return ps->ops[i].id; + } + + return OP_NONE; +} + +static inline void clear_operand_string(struct filter_parse_state *ps) +{ + memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL); + ps->operand.tail = 0; +} + +static inline int append_operand_char(struct filter_parse_state *ps, char c) +{ + if (ps->operand.tail == MAX_FILTER_STR_VAL) + return -EINVAL; + + ps->operand.string[ps->operand.tail++] = c; + + return 0; +} + +static int filter_opstack_push(struct filter_parse_state *ps, int op) +{ + struct opstack_op *opstack_op; + + opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL); + if (!opstack_op) + return -ENOMEM; + + opstack_op->op = op; + list_add(&opstack_op->list, &ps->opstack); + + return 0; +} + +static int filter_opstack_empty(struct filter_parse_state *ps) +{ + return list_empty(&ps->opstack); +} + +static int filter_opstack_top(struct filter_parse_state *ps) +{ + struct opstack_op *opstack_op; + + if (filter_opstack_empty(ps)) + return OP_NONE; + + opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + + return opstack_op->op; +} + +static int filter_opstack_pop(struct filter_parse_state *ps) +{ + struct opstack_op *opstack_op; + int op; + + if (filter_opstack_empty(ps)) + return OP_NONE; + + opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + op = opstack_op->op; + list_del(&opstack_op->list); + + kfree(opstack_op); + + return op; +} + +static void filter_opstack_clear(struct filter_parse_state *ps) +{ + while (!filter_opstack_empty(ps)) + filter_opstack_pop(ps); +} + +static char *curr_operand(struct filter_parse_state *ps) +{ + return ps->operand.string; +} + +static int postfix_append_operand(struct filter_parse_state *ps, char *operand) +{ + struct postfix_elt *elt; + + elt = kmalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return -ENOMEM; + + elt->op = OP_NONE; + elt->operand = kstrdup(operand, GFP_KERNEL); + if (!elt->operand) { + kfree(elt); + return -ENOMEM; + } + + list_add_tail(&elt->list, &ps->postfix); + + return 0; +} + +static int postfix_append_op(struct filter_parse_state *ps, int op) +{ + struct postfix_elt *elt; + + elt = kmalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return -ENOMEM; + + elt->op = op; + elt->operand = NULL; + + list_add_tail(&elt->list, &ps->postfix); + + return 0; +} + +static void postfix_clear(struct filter_parse_state *ps) +{ + struct postfix_elt *elt; + + while (!list_empty(&ps->postfix)) { + elt = list_first_entry(&ps->postfix, struct postfix_elt, list); + kfree(elt->operand); + list_del(&elt->list); + } +} + +static int filter_parse(struct filter_parse_state *ps) +{ + int op, top_op; + char ch; + + while ((ch = infix_next(ps))) { + if (isspace(ch)) + continue; + + if (is_op_char(ps, ch)) { + op = infix_get_op(ps, ch); + if (op == OP_NONE) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); return -EINVAL; } - tok_n = 2; + + if (strlen(curr_operand(ps))) { + postfix_append_operand(ps, curr_operand(ps)); + clear_operand_string(ps); + } + + while (!filter_opstack_empty(ps)) { + top_op = filter_opstack_top(ps); + if (!is_precedence_lower(ps, top_op, op)) { + top_op = filter_opstack_pop(ps); + postfix_append_op(ps, top_op); + continue; + } + break; + } + + filter_opstack_push(ps, op); continue; } - if (tok_n == 2) { - if (pred->compound) { - if (!strcmp(tok, "!=")) - pred->not = 1; - else if (!strcmp(tok, "==")) - pred->not = 0; - else { - pred->field_name = NULL; - return -EINVAL; - } - } else { - val_str = tok; - break; /* done */ + + if (ch == '(') { + filter_opstack_push(ps, OP_OPEN_PAREN); + continue; + } + + if (ch == ')') { + if (strlen(curr_operand(ps))) { + postfix_append_operand(ps, curr_operand(ps)); + clear_operand_string(ps); + } + + top_op = filter_opstack_pop(ps); + while (top_op != OP_NONE) { + if (top_op == OP_OPEN_PAREN) + break; + postfix_append_op(ps, top_op); + top_op = filter_opstack_pop(ps); + } + if (top_op == OP_NONE) { + parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); + return -EINVAL; } - tok_n = 3; continue; } - if (tok_n == 3) { - val_str = tok; - break; /* done */ + if (append_operand_char(ps, ch)) { + parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0); + return -EINVAL; + } + } + + if (strlen(curr_operand(ps))) + postfix_append_operand(ps, curr_operand(ps)); + + while (!filter_opstack_empty(ps)) { + top_op = filter_opstack_pop(ps); + if (top_op == OP_NONE) + break; + if (top_op == OP_OPEN_PAREN) { + parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); + return -EINVAL; + } + postfix_append_op(ps, top_op); + } + + return 0; +} + +static struct filter_pred *create_pred(int op, char *operand1, char *operand2) +{ + struct filter_pred *pred; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return NULL; + + pred->field_name = kstrdup(operand1, GFP_KERNEL); + if (!pred->field_name) { + kfree(pred); + return NULL; + } + + strcpy(pred->str_val, operand2); + pred->str_len = strlen(operand2); + + pred->op = op; + + return pred; +} + +static struct filter_pred *create_logical_pred(int op) +{ + struct filter_pred *pred; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return NULL; + + pred->op = op; + + return pred; +} + +static int check_preds(struct filter_parse_state *ps) +{ + int n_normal_preds = 0, n_logical_preds = 0; + struct postfix_elt *elt; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) + continue; + + if (elt->op == OP_AND || elt->op == OP_OR) { + n_logical_preds++; + continue; } + n_normal_preds++; } - if (!val_str || !strlen(val_str) - || strlen(val_str) >= MAX_FILTER_STR_VAL) { - pred->field_name = NULL; + if (!n_normal_preds || n_logical_preds >= n_normal_preds) { + parse_error(ps, FILT_ERR_INVALID_FILTER, 0); return -EINVAL; } - strcpy(pred->str_val, val_str); - pred->str_len = strlen(val_str); + return 0; +} - pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); - if (!pred->field_name) - return -ENOMEM; +static int replace_preds(struct event_subsystem *system, + struct ftrace_event_call *call, + struct filter_parse_state *ps, + char *filter_string) +{ + char *operand1 = NULL, *operand2 = NULL; + struct filter_pred *pred; + struct postfix_elt *elt; + int err; + + err = check_preds(ps); + if (err) + return err; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) { + if (!operand1) + operand1 = elt->operand; + else if (!operand2) + operand2 = elt->operand; + else { + parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); + return -EINVAL; + } + continue; + } + + if (elt->op == OP_AND || elt->op == OP_OR) { + pred = create_logical_pred(elt->op); + if (call) { + err = filter_add_pred(ps, call, pred); + filter_free_pred(pred); + } else + err = filter_add_subsystem_pred(ps, system, + pred, filter_string); + if (err) + return err; + + operand1 = operand2 = NULL; + continue; + } + + if (!operand1 || !operand2) { + parse_error(ps, FILT_ERR_MISSING_FIELD, 0); + return -EINVAL; + } + + pred = create_pred(elt->op, operand1, operand2); + if (call) { + err = filter_add_pred(ps, call, pred); + filter_free_pred(pred); + } else + err = filter_add_subsystem_pred(ps, system, pred, + filter_string); + if (err) + return err; + + operand1 = operand2 = NULL; + } return 0; } +int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +{ + int err; + + struct filter_parse_state *ps; + + mutex_lock(&filter_mutex); + + if (!strcmp(strstrip(filter_string), "0")) { + filter_disable_preds(call); + remove_filter_string(call->filter); + mutex_unlock(&filter_mutex); + return 0; + } + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + return -ENOMEM; + + filter_disable_preds(call); + replace_filter_string(call->filter, filter_string); + + parse_init(ps, filter_ops, filter_string); + err = filter_parse(ps); + if (err) { + append_filter_err(ps, call->filter); + goto out; + } + + err = replace_preds(NULL, call, ps, filter_string); + if (err) + append_filter_err(ps, call->filter); + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + + mutex_unlock(&filter_mutex); + + return err; +} + +int apply_subsystem_event_filter(struct event_subsystem *system, + char *filter_string) +{ + int err; + + struct filter_parse_state *ps; + + mutex_lock(&filter_mutex); + + if (!strcmp(strstrip(filter_string), "0")) { + filter_free_subsystem_preds(system); + remove_filter_string(system->filter); + mutex_unlock(&filter_mutex); + return 0; + } + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + return -ENOMEM; + + filter_free_subsystem_preds(system); + replace_filter_string(system->filter, filter_string); + + parse_init(ps, filter_ops, filter_string); + err = filter_parse(ps); + if (err) { + append_filter_err(ps, system->filter); + goto out; + } + + err = replace_preds(system, NULL, ps, filter_string); + if (err) + append_filter_err(ps, system->filter); + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + + mutex_unlock(&filter_mutex); + + return err; +} -- cgit v1.2.3 From a0e39ed378fb6ba916522764cd508fa7d42ad495 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 29 Apr 2009 13:51:39 +0200 Subject: tracing: fix build failure on s390 "tracing: create automated trace defines" causes this compile error on s390, as reported by Sachin Sant against linux-next: kernel/built-in.o: In function `__do_softirq': (.text+0x1c680): undefined reference to `__tracepoint_softirq_entry' This happens because the definitions of the softirq tracepoints were moved from kernel/softirq.c to kernel/irq/handle.c. Since s390 doesn't support generic hardirqs handle.c doesn't get compiled and the definitions are missing. So move the tracepoints to softirq.c again. [ Impact: fix build failure on s390 ] Reported-by: Sachin Sant Signed-off-by: Heiko Carstens Cc: Steven Rostedt Cc: fweisbec@gmail.com LKML-Reference: <20090429135139.5fac79b8@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 2 -- kernel/softirq.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 37c63633e78..e68bb5aebe0 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -18,8 +18,6 @@ #include #include #include - -#define CREATE_TRACE_POINTS #include #include "internals.h" diff --git a/kernel/softirq.c b/kernel/softirq.c index 7ab9dfd8d08..d4ba347a872 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -24,6 +24,8 @@ #include #include #include + +#define CREATE_TRACE_POINTS #include #include -- cgit v1.2.3 From 50fa610a3b6ba7cf91d7a92229177dfaff2b81a1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 28 Apr 2009 15:01:38 +0100 Subject: sched: Document memory barriers implied by sleep/wake-up primitives Add a section to the memory barriers document to note the implied memory barriers of sleep primitives (set_current_state() and wrappers) and wake-up primitives (wake_up() and co.). Also extend the in-code comments on the wake_up() functions to note these implied barriers. [ Impact: add documentation ] Signed-off-by: David Howells Cc: Oleg Nesterov Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <20090428140138.1192.94723.stgit@warthog.procyon.org.uk> Signed-off-by: Ingo Molnar --- kernel/sched.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b902e587a3a..fd0c2cee3f3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2458,6 +2458,17 @@ out: return success; } +/** + * wake_up_process - Wake up a specific process + * @p: The process to be woken up. + * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes. Returns 1 if the process was woken up, 0 if it was already + * running. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ int wake_up_process(struct task_struct *p) { return try_to_wake_up(p, TASK_ALL, 0); @@ -5241,6 +5252,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) @@ -5279,6 +5293,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) * with each other. This can prevent needless bouncing between CPUs. * * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) @@ -5315,6 +5332,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ * awakened in the same order in which they were queued. * * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void complete(struct completion *x) { @@ -5332,6 +5352,9 @@ EXPORT_SYMBOL(complete); * @x: holds the state of this particular completion * * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void complete_all(struct completion *x) { -- cgit v1.2.3 From 4aeb0b4239bb3b67ed402cb9cef3e000c892cadf Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:03 +0200 Subject: perfcounters: rename struct hw_perf_counter_ops into struct pmu This patch renames struct hw_perf_counter_ops into struct pmu. It introduces a structure to describe a cpu specific pmu (performance monitoring unit). It may contain ops and data. The new name of the structure fits better, is shorter, and thus better to handle. Where it was appropriate, names of function and variable have been changed too. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-7-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 68 ++++++++++++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 09396098dd0..582108addef 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -52,8 +52,7 @@ static DEFINE_MUTEX(perf_resource_mutex); /* * Architecture provided APIs - weak aliases: */ -extern __weak const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter) +extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { return NULL; } @@ -124,7 +123,7 @@ counter_sched_out(struct perf_counter *counter, counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_stopped = ctx->time; - counter->hw_ops->disable(counter); + counter->pmu->disable(counter); counter->oncpu = -1; if (!is_software_counter(counter)) @@ -417,7 +416,7 @@ counter_sched_in(struct perf_counter *counter, */ smp_wmb(); - if (counter->hw_ops->enable(counter)) { + if (counter->pmu->enable(counter)) { counter->state = PERF_COUNTER_STATE_INACTIVE; counter->oncpu = -1; return -EAGAIN; @@ -1096,7 +1095,7 @@ static void __read(void *info) local_irq_save(flags); if (ctx->is_active) update_context_time(ctx); - counter->hw_ops->read(counter); + counter->pmu->read(counter); update_counter_times(counter); local_irq_restore(flags); } @@ -1922,7 +1921,7 @@ static void perf_counter_output(struct perf_counter *counter, leader = counter->group_leader; list_for_each_entry(sub, &leader->sibling_list, list_entry) { if (sub != counter) - sub->hw_ops->read(sub); + sub->pmu->read(sub); group_entry.event = sub->hw_event.config; group_entry.counter = atomic64_read(&sub->count); @@ -2264,7 +2263,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) struct pt_regs *regs; counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->hw_ops->read(counter); + counter->pmu->read(counter); regs = get_irq_regs(); /* @@ -2410,7 +2409,7 @@ static void perf_swcounter_disable(struct perf_counter *counter) perf_swcounter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_generic = { +static const struct pmu perf_ops_generic = { .enable = perf_swcounter_enable, .disable = perf_swcounter_disable, .read = perf_swcounter_read, @@ -2460,7 +2459,7 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter) cpu_clock_perf_counter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_cpu_clock = { +static const struct pmu perf_ops_cpu_clock = { .enable = cpu_clock_perf_counter_enable, .disable = cpu_clock_perf_counter_disable, .read = cpu_clock_perf_counter_read, @@ -2522,7 +2521,7 @@ static void task_clock_perf_counter_read(struct perf_counter *counter) task_clock_perf_counter_update(counter, time); } -static const struct hw_perf_counter_ops perf_ops_task_clock = { +static const struct pmu perf_ops_task_clock = { .enable = task_clock_perf_counter_enable, .disable = task_clock_perf_counter_disable, .read = task_clock_perf_counter_read, @@ -2574,7 +2573,7 @@ static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) cpu_migrations_perf_counter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { +static const struct pmu perf_ops_cpu_migrations = { .enable = cpu_migrations_perf_counter_enable, .disable = cpu_migrations_perf_counter_disable, .read = cpu_migrations_perf_counter_read, @@ -2600,8 +2599,7 @@ static void tp_perf_counter_destroy(struct perf_counter *counter) ftrace_profile_disable(perf_event_id(&counter->hw_event)); } -static const struct hw_perf_counter_ops * -tp_perf_counter_init(struct perf_counter *counter) +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { int event_id = perf_event_id(&counter->hw_event); int ret; @@ -2616,18 +2614,16 @@ tp_perf_counter_init(struct perf_counter *counter) return &perf_ops_generic; } #else -static const struct hw_perf_counter_ops * -tp_perf_counter_init(struct perf_counter *counter) +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { return NULL; } #endif -static const struct hw_perf_counter_ops * -sw_perf_counter_init(struct perf_counter *counter) +static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_hw_event *hw_event = &counter->hw_event; - const struct hw_perf_counter_ops *hw_ops = NULL; + const struct pmu *pmu = NULL; struct hw_perf_counter *hwc = &counter->hw; /* @@ -2639,7 +2635,7 @@ sw_perf_counter_init(struct perf_counter *counter) */ switch (perf_event_id(&counter->hw_event)) { case PERF_COUNT_CPU_CLOCK: - hw_ops = &perf_ops_cpu_clock; + pmu = &perf_ops_cpu_clock; if (hw_event->irq_period && hw_event->irq_period < 10000) hw_event->irq_period = 10000; @@ -2650,9 +2646,9 @@ sw_perf_counter_init(struct perf_counter *counter) * use the cpu_clock counter instead. */ if (counter->ctx->task) - hw_ops = &perf_ops_task_clock; + pmu = &perf_ops_task_clock; else - hw_ops = &perf_ops_cpu_clock; + pmu = &perf_ops_cpu_clock; if (hw_event->irq_period && hw_event->irq_period < 10000) hw_event->irq_period = 10000; @@ -2661,18 +2657,18 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_PAGE_FAULTS_MIN: case PERF_COUNT_PAGE_FAULTS_MAJ: case PERF_COUNT_CONTEXT_SWITCHES: - hw_ops = &perf_ops_generic; + pmu = &perf_ops_generic; break; case PERF_COUNT_CPU_MIGRATIONS: if (!counter->hw_event.exclude_kernel) - hw_ops = &perf_ops_cpu_migrations; + pmu = &perf_ops_cpu_migrations; break; } - if (hw_ops) + if (pmu) hwc->irq_period = hw_event->irq_period; - return hw_ops; + return pmu; } /* @@ -2685,7 +2681,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, struct perf_counter *group_leader, gfp_t gfpflags) { - const struct hw_perf_counter_ops *hw_ops; + const struct pmu *pmu; struct perf_counter *counter; long err; @@ -2713,46 +2709,46 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->cpu = cpu; counter->hw_event = *hw_event; counter->group_leader = group_leader; - counter->hw_ops = NULL; + counter->pmu = NULL; counter->ctx = ctx; counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) counter->state = PERF_COUNTER_STATE_OFF; - hw_ops = NULL; + pmu = NULL; if (perf_event_raw(hw_event)) { - hw_ops = hw_perf_counter_init(counter); + pmu = hw_perf_counter_init(counter); goto done; } switch (perf_event_type(hw_event)) { case PERF_TYPE_HARDWARE: - hw_ops = hw_perf_counter_init(counter); + pmu = hw_perf_counter_init(counter); break; case PERF_TYPE_SOFTWARE: - hw_ops = sw_perf_counter_init(counter); + pmu = sw_perf_counter_init(counter); break; case PERF_TYPE_TRACEPOINT: - hw_ops = tp_perf_counter_init(counter); + pmu = tp_perf_counter_init(counter); break; } done: err = 0; - if (!hw_ops) + if (!pmu) err = -EINVAL; - else if (IS_ERR(hw_ops)) - err = PTR_ERR(hw_ops); + else if (IS_ERR(pmu)) + err = PTR_ERR(pmu); if (err) { kfree(counter); return ERR_PTR(err); } - counter->hw_ops = hw_ops; + counter->pmu = pmu; if (counter->hw_event.mmap) atomic_inc(&nr_mmap_tracking); -- cgit v1.2.3 From 98144511427c192e4249ff66a3f9debc55c59411 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2009 14:52:50 +0200 Subject: perf_counter: add/update copyrights Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 582108addef..a95a171e608 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1,9 +1,9 @@ /* * Performance counter core code * - * Copyright(C) 2008 Thomas Gleixner - * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar - * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * * For licensing details see kernel-base/COPYING */ -- cgit v1.2.3 From 23b94b967f118bef941369238f33c8140be46539 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Wed, 29 Apr 2009 21:54:51 +0100 Subject: locking, rtmutex.c: Documentation cleanup Two minor updates on functions documentation: - Updated documentation for function rt_mutex_unlock(), which contained an incorrect name - Removed extra '*' from comment in function rt_mutex_destroy() [ Impact: cleanup ] Signed-off-by: Luis Henriques Cc: Steven Rostedt LKML-Reference: <20090429205451.GA23154@hades.domain.com> Signed-off-by: Ingo Molnar --- kernel/rtmutex.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 69d9cb921ff..013882e8349 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -864,9 +864,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); /** - * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible - * the timeout structure is provided - * by the caller + * rt_mutex_timed_lock - lock a rt_mutex interruptible + * the timeout structure is provided + * by the caller * * @lock: the rt_mutex to be locked * @timeout: timeout structure or NULL (no timeout) @@ -913,7 +913,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock) } EXPORT_SYMBOL_GPL(rt_mutex_unlock); -/*** +/** * rt_mutex_destroy - mark a mutex unusable * @lock: the mutex to be destroyed * -- cgit v1.2.3 From 3bcac0263f0b45e67a64034ebcb69eb9abb742f4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 29 Apr 2009 13:45:05 +0100 Subject: SELinux: Don't flush inherited SIGKILL during execve() Don't flush inherited SIGKILL during execve() in SELinux's post cred commit hook. This isn't really a security problem: if the SIGKILL came before the credentials were changed, then we were right to receive it at the time, and should honour it; if it came after the creds were changed, then we definitely should honour it; and in any case, all that will happen is that the process will be scrapped before it ever returns to userspace. Signed-off-by: David Howells Signed-off-by: Oleg Nesterov Signed-off-by: James Morris --- kernel/signal.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 1c8814481a1..f93efec14ff 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -238,14 +238,19 @@ void flush_sigqueue(struct sigpending *queue) /* * Flush all pending signals for a task. */ +void __flush_signals(struct task_struct *t) +{ + clear_tsk_thread_flag(t, TIF_SIGPENDING); + flush_sigqueue(&t->pending); + flush_sigqueue(&t->signal->shared_pending); +} + void flush_signals(struct task_struct *t) { unsigned long flags; spin_lock_irqsave(&t->sighand->siglock, flags); - clear_tsk_thread_flag(t, TIF_SIGPENDING); - flush_sigqueue(&t->pending); - flush_sigqueue(&t->signal->shared_pending); + __flush_signals(t); spin_unlock_irqrestore(&t->sighand->siglock, flags); } -- cgit v1.2.3 From c5dd016cdf0a040e1de0b691e274fbfe642b2cdc Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 30 Apr 2009 09:48:16 +1000 Subject: perf_counter: update copyright notice This adds my name to the list of copyright holders on the core perf_counter.c, since I have contributed a significant amount of the code in there. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter LKML-Reference: <18936.59200.888049.746658@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index a95a171e608..75f2b6c8239 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -4,6 +4,7 @@ * Copyright (C) 2008 Thomas Gleixner * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright © 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING */ -- cgit v1.2.3 From a511e3f968c462a55ef58697257f5347c73d306e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 29 Apr 2009 15:59:58 -0700 Subject: mutex: add atomic_dec_and_mutex_lock(), fix include/linux/mutex.h:136: warning: 'mutex_lock' declared inline after being called include/linux/mutex.h:136: warning: previous declaration of 'mutex_lock' was here uninline it. [ Impact: clean up and uninline, address compiler warning ] Signed-off-by: Andrew Morton Cc: Al Viro Cc: Christoph Hellwig Cc: Eric Paris Cc: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <200904292318.n3TNIsi6028340@imap1.linux-foundation.org> Signed-off-by: Ingo Molnar --- kernel/mutex.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index 507cf2b5e9f..e2d25e9e62a 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -471,5 +471,28 @@ int __sched mutex_trylock(struct mutex *lock) return ret; } - EXPORT_SYMBOL(mutex_trylock); + +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} +EXPORT_SYMBOL(atomic_dec_and_mutex_lock); -- cgit v1.2.3 From ba9c22f2c01cf5c88beed5a6b9e07d42e10bd358 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Mon, 20 Apr 2009 22:22:22 -0700 Subject: futex: remove FUTEX_REQUEUE_PI (non CMP) The new requeue PI futex op codes were modeled after the existing FUTEX_REQUEUE and FUTEX_CMP_REQUEUE calls. I was unaware at the time that FUTEX_REQUEUE was only around for compatibility reasons and shouldn't be used in new code. Ulrich Drepper elaborates on this in his Futexes are Tricky paper: http://people.redhat.com/drepper/futex.pdf. The deprecated call doesn't catch changes to the futex corresponding to the destination futex which can lead to deadlock. Therefor, I feel it best to remove FUTEX_REQUEUE_PI and leave only FUTEX_CMP_REQUEUE_PI as there are not yet any existing users of the API. This patch does change the OP code value of FUTEX_CMP_REQUEUE_PI to 12 from 13. Since my test case is the only known user of this API, I felt this was the right thing to do, rather than leave a hole in the enumeration. I chose to continue using the _CMP_ modifier in the OP code to make it explicit to the user that the test is being done. Builds, boots, and ran several hundred iterations requeue_pi.c. Signed-off-by: Darren Hart LKML-Reference: <49ED580E.1050502@us.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6d2daa46f9f..aec8bf89bf4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2555,9 +2555,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, clockrt, uaddr2); break; - case FUTEX_REQUEUE_PI: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 1); - break; case FUTEX_CMP_REQUEUE_PI: ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 1); @@ -2596,8 +2593,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. */ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || - cmd == FUTEX_REQUEUE_PI || cmd == FUTEX_CMP_REQUEUE_PI || - cmd == FUTEX_WAKE_OP) + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) val2 = (u32) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); -- cgit v1.2.3 From 30b4ae8a4498543863501f707879b7220b649602 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 4 Apr 2009 21:01:01 +0000 Subject: signals: split do_tkill Split out the code from do_tkill to make it reusable by the follow up patch which implements sys_rt_tgsigqueueinfo Signed-off-by: Thomas Gleixner Reviewed-by: Oleg Nesterov --- kernel/signal.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index d8034737db4..56d27acad87 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2278,24 +2278,17 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) return kill_something_info(sig, &info, pid); } -static int do_tkill(pid_t tgid, pid_t pid, int sig) +static int +do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) { - int error; - struct siginfo info; struct task_struct *p; unsigned long flags; - - error = -ESRCH; - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_TKILL; - info.si_pid = task_tgid_vnr(current); - info.si_uid = current_uid(); + int error = -ESRCH; rcu_read_lock(); p = find_task_by_vpid(pid); if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { - error = check_kill_permission(sig, &info, p); + error = check_kill_permission(sig, info, p); /* * The null signal is a permissions and process existence * probe. No signal is actually delivered. @@ -2305,7 +2298,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) * signal is private anyway. */ if (!error && sig && lock_task_sighand(p, &flags)) { - error = specific_send_sig_info(sig, &info, p); + error = specific_send_sig_info(sig, info, p); unlock_task_sighand(p, &flags); } } @@ -2314,6 +2307,19 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) return error; } +static int do_tkill(pid_t tgid, pid_t pid, int sig) +{ + struct siginfo info; + + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_TKILL; + info.si_pid = task_tgid_vnr(current); + info.si_uid = current_uid(); + + return do_send_specific(tgid, pid, sig, &info); +} + /** * sys_tgkill - send signal to one specific thread * @tgid: the thread group ID of the thread -- cgit v1.2.3 From 62ab4505e3efaf67784f84059e0fb9cedb1728ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 4 Apr 2009 21:01:06 +0000 Subject: signals: implement sys_rt_tgsigqueueinfo sys_kill has the per thread counterpart sys_tgkill. sigqueueinfo is missing a thread directed counterpart. Such an interface is important for migrating applications from other OSes which have the per thread delivery implemented. Signed-off-by: Thomas Gleixner Reviewed-by: Oleg Nesterov Acked-by: Roland McGrath Acked-by: Ulrich Drepper --- kernel/compat.c | 11 +++++++++++ kernel/signal.c | 26 ++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 42d56544460..f6c204f07ea 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, } +asmlinkage long +compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, + struct compat_siginfo __user *uinfo) +{ + siginfo_t info; + + if (copy_siginfo_from_user32(&info, uinfo)) + return -EFAULT; + return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); +} + #ifdef __ARCH_WANT_COMPAT_SYS_TIME /* compat_time_t is a 32 bit "long" and needs to get converted. */ diff --git a/kernel/signal.c b/kernel/signal.c index 56d27acad87..f79b3b9f837 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2369,6 +2369,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, return kill_proc_info(sig, &info, pid); } +long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) +{ + /* This is only valid for single tasks */ + if (pid <= 0 || tgid <= 0) + return -EINVAL; + + /* Not even root can pretend to send signals from the kernel. + Nor can they impersonate a kill(), which adds source info. */ + if (info->si_code >= 0) + return -EPERM; + info->si_signo = sig; + + return do_send_specific(tgid, pid, sig, info); +} + +SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, + siginfo_t __user *, uinfo) +{ + siginfo_t info; + + if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) + return -EFAULT; + + return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); +} + int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct task_struct *t = current; -- cgit v1.2.3 From 78a3d9d5654a7fd99cf8b2ab06b9497b9c7aad64 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 29 Apr 2009 18:01:23 +0200 Subject: do_wait: do take security_task_wait() into account I was never able to understand what should we actually do when security_task_wait() fails, but the current code doesn't look right. If ->task_wait() returns the error, we update *notask_error correctly. But then we either reap the child (despite the fact this was forbidden) or clear *notask_error (and hide the securiy policy problems). This patch assumes that "stolen by ptrace" doesn't matter. If selinux denies the child we should ignore it but make sure we report -EACCESS instead of -ECHLD if there are no other eligible children. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: James Morris --- kernel/exit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 167e1e3ad7c..d2e8239ea18 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1582,6 +1582,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace, */ if (*notask_error) *notask_error = ret; + return 0; } if (likely(!ptrace) && unlikely(p->ptrace)) { -- cgit v1.2.3 From c33a0bc4e41ef169d6e807d8abb9502544b518e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 May 2009 12:23:16 +0200 Subject: perf_counter: fix race in perf_output_* When two (or more) contexts output to the same buffer, it is possible to observe half written output. Suppose we have CPU0 doing perf_counter_mmap(), CPU1 doing perf_counter_overflow(). If CPU1 does a wakeup and exposes head to user-space, then CPU2 can observe the data CPU0 is still writing. [ Impact: fix occasionally corrupted profiling records ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090501102533.007821627@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 130 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 101 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 75f2b6c8239..8660ae57953 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1279,14 +1279,12 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_counter *counter = file->private_data; struct perf_mmap_data *data; - unsigned int events; + unsigned int events = POLL_HUP; rcu_read_lock(); data = rcu_dereference(counter->data); if (data) - events = atomic_xchg(&data->wakeup, 0); - else - events = POLL_HUP; + events = atomic_xchg(&data->poll, 0); rcu_read_unlock(); poll_wait(file, &counter->waitq, wait); @@ -1568,22 +1566,6 @@ static const struct file_operations perf_fops = { void perf_counter_wakeup(struct perf_counter *counter) { - struct perf_mmap_data *data; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (data) { - atomic_set(&data->wakeup, POLL_IN); - /* - * Ensure all data writes are issued before updating the - * user-space data head information. The matching rmb() - * will be in userspace after reading this value. - */ - smp_wmb(); - data->user_page->data_head = atomic_read(&data->head); - } - rcu_read_unlock(); - wake_up_all(&counter->waitq); if (counter->pending_kill) { @@ -1721,10 +1703,14 @@ struct perf_output_handle { int wakeup; int nmi; int overflow; + int locked; + unsigned long flags; }; -static inline void __perf_output_wakeup(struct perf_output_handle *handle) +static void perf_output_wakeup(struct perf_output_handle *handle) { + atomic_set(&handle->data->poll, POLL_IN); + if (handle->nmi) { handle->counter->pending_wakeup = 1; perf_pending_queue(&handle->counter->pending, @@ -1733,6 +1719,86 @@ static inline void __perf_output_wakeup(struct perf_output_handle *handle) perf_counter_wakeup(handle->counter); } +/* + * Curious locking construct. + * + * We need to ensure a later event doesn't publish a head when a former + * event isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * What we do is serialize between CPUs so we only have to deal with NMI + * nesting on a single CPU. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event completes. + */ +static void perf_output_lock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + int cpu; + + handle->locked = 0; + + local_irq_save(handle->flags); + cpu = smp_processor_id(); + + if (in_nmi() && atomic_read(&data->lock) == cpu) + return; + + while (atomic_cmpxchg(&data->lock, 0, cpu) != 0) + cpu_relax(); + + handle->locked = 1; +} + +static void perf_output_unlock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + int head, cpu; + + if (handle->wakeup) + data->wakeup_head = data->head; + + if (!handle->locked) + goto out; + +again: + /* + * The xchg implies a full barrier that ensures all writes are done + * before we publish the new head, matched by a rmb() in userspace when + * reading this position. + */ + while ((head = atomic_xchg(&data->wakeup_head, 0))) { + data->user_page->data_head = head; + handle->wakeup = 1; + } + + /* + * NMI can happen here, which means we can miss a wakeup_head update. + */ + + cpu = atomic_xchg(&data->lock, 0); + WARN_ON_ONCE(cpu != smp_processor_id()); + + /* + * Therefore we have to validate we did not indeed do so. + */ + if (unlikely(atomic_read(&data->wakeup_head))) { + /* + * Since we had it locked, we can lock it again. + */ + while (atomic_cmpxchg(&data->lock, 0, cpu) != 0) + cpu_relax(); + + goto again; + } + + if (handle->wakeup) + perf_output_wakeup(handle); +out: + local_irq_restore(handle->flags); +} + static int perf_output_begin(struct perf_output_handle *handle, struct perf_counter *counter, unsigned int size, int nmi, int overflow) @@ -1745,6 +1811,7 @@ static int perf_output_begin(struct perf_output_handle *handle, if (!data) goto out; + handle->data = data; handle->counter = counter; handle->nmi = nmi; handle->overflow = overflow; @@ -1752,12 +1819,13 @@ static int perf_output_begin(struct perf_output_handle *handle, if (!data->nr_pages) goto fail; + perf_output_lock(handle); + do { offset = head = atomic_read(&data->head); head += size; } while (atomic_cmpxchg(&data->head, offset, head) != offset); - handle->data = data; handle->offset = offset; handle->head = head; handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); @@ -1765,7 +1833,7 @@ static int perf_output_begin(struct perf_output_handle *handle, return 0; fail: - __perf_output_wakeup(handle); + perf_output_wakeup(handle); out: rcu_read_unlock(); @@ -1809,16 +1877,20 @@ static void perf_output_copy(struct perf_output_handle *handle, static void perf_output_end(struct perf_output_handle *handle) { - int wakeup_events = handle->counter->hw_event.wakeup_events; + struct perf_counter *counter = handle->counter; + struct perf_mmap_data *data = handle->data; + + int wakeup_events = counter->hw_event.wakeup_events; if (handle->overflow && wakeup_events) { - int events = atomic_inc_return(&handle->data->events); + int events = atomic_inc_return(&data->events); if (events >= wakeup_events) { - atomic_sub(wakeup_events, &handle->data->events); - __perf_output_wakeup(handle); + atomic_sub(wakeup_events, &data->events); + handle->wakeup = 1; } - } else if (handle->wakeup) - __perf_output_wakeup(handle); + } + + perf_output_unlock(handle); rcu_read_unlock(); } -- cgit v1.2.3 From 15e957d08dd4a841359cfec59ecb74041e0097aa Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 30 Apr 2009 01:17:50 -0700 Subject: x86/irq: use move_irq_desc() in create_irq_nr() move_irq_desc() will try to move irq_desc to the home node if the allocated one is not correct, in create_irq_nr(). ( This can happen on devices that are on different nodes that are using MSI, when drivers are loaded and unloaded randomly. ) v2: fix non-smp build v3: add NUMA_IRQ_DESC to eliminate #ifdefs [ Impact: improve irq descriptor locality on NUMA systems ] Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Suresh Siddha Cc: "Eric W. Biederman" Cc: Rusty Russell LKML-Reference: <49F95EAE.2050903@kernel.org> Signed-off-by: Ingo Molnar --- kernel/irq/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 2f065277f8e..7d047808419 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o -obj-$(CONFIG_SPARSE_IRQ) += numa_migrate.o +obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o obj-$(CONFIG_PM_SLEEP) += pm.o -- cgit v1.2.3 From a25cbd045a2ffc42787d4dbcbb9c7118f5f42732 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 1 May 2009 14:45:46 +0900 Subject: clocksource: setup mult_orig in clocksource_enable() Setup clocksource mult_orig in clocksource_enable(). Clocksource drivers can save power by using keeping the device clock disabled while the clocksource is unused. In practice this means that the enable() and disable() callbacks perform clk_enable() and clk_disable(). The enable() callback may also use clk_get_rate() to get the clock rate from the clock framework. This information can then be used to calculate the shift and mult variables. Currently the mult_orig variable is setup from mult at registration time only. This is conflicting with the above case since the clock is disabled and the mult variable is not yet calculated at the time of registration. Moving the mult_orig setup code to clocksource_enable() allows us to both handle the common case with no enable() callback and the mult-changed-after-enable() case. [ Impact: allow dynamic clock source usage ] Signed-off-by: Magnus Damm LKML-Reference: <20090501054546.8193.10688.sendpatchset@rx1.opensource.se> Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index ecfd7b5187e..80189f6f1c5 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -402,9 +402,6 @@ int clocksource_register(struct clocksource *c) unsigned long flags; int ret; - /* save mult_orig on registration */ - c->mult_orig = c->mult; - spin_lock_irqsave(&clocksource_lock, flags); ret = clocksource_enqueue(c); if (!ret) -- cgit v1.2.3 From 7d27558c4138ac6b3684dea35c2f4379b940a7dd Mon Sep 17 00:00:00 2001 From: john stultz Date: Fri, 1 May 2009 13:10:26 -0700 Subject: timekeeping: create arch_gettimeoffset infrastructure Some arches don't supply their own clocksource. This is mainly the case in architectures that get their inter-tick times by reading the counter on their interval timer. Since these timers wrap every tick, they're not really useful as clocksources. Wrapping them to act like one is possible but not very efficient. So we provide a callout these arches can implement for use with the jiffies clocksource to provide finer then tick granular time. [ Impact: ease the migration to generic time keeping ] Signed-off-by: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 687dff49f6e..e97c50f8458 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -77,6 +77,10 @@ static void clocksource_forward_now(void) clock->cycle_last = cycle_now; nsec = cyc2ns(clock, cycle_delta); + + /* If arch requires, add in gettimeoffset() */ + nsec += arch_gettimeoffset(); + timespec_add_ns(&xtime, nsec); nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; @@ -111,6 +115,9 @@ void getnstimeofday(struct timespec *ts) /* convert to nanoseconds: */ nsecs = cyc2ns(clock, cycle_delta); + /* If arch requires, add in gettimeoffset() */ + nsecs += arch_gettimeoffset(); + } while (read_seqretry(&xtime_lock, seq)); timespec_add_ns(ts, nsecs); -- cgit v1.2.3 From b82914ce33146186d554b0f5c41e4e13693614ce Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2009 18:54:32 +0200 Subject: perf_counter: round-robin per-CPU counters too This used to be unstable when we had the rq->lock dependencies, but now that they are that of the past we can turn on percpu counter RR too. [ Impact: handle counter over-commit for per-CPU counters too ] LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8660ae57953..b9679c36bcc 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1069,18 +1069,14 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_counter_context *ctx = &curr->perf_counter_ctx; - const int rotate_percpu = 0; - if (rotate_percpu) - perf_counter_cpu_sched_out(cpuctx); + perf_counter_cpu_sched_out(cpuctx); perf_counter_task_sched_out(curr, cpu); - if (rotate_percpu) - rotate_ctx(&cpuctx->ctx); + rotate_ctx(&cpuctx->ctx); rotate_ctx(ctx); - if (rotate_percpu) - perf_counter_cpu_sched_in(cpuctx, cpu); + perf_counter_cpu_sched_in(cpuctx, cpu); perf_counter_task_sched_in(curr, cpu); } -- cgit v1.2.3 From 0d905bca23aca5c86a10ee101bcd3b1abbd40b25 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2009 19:13:30 +0200 Subject: perf_counter: initialize the per-cpu context earlier percpu scheduling for perfcounters wants to take the context lock, but that lock first needs to be initialized. Currently it is an early_initcall() - but that is too late, the task tick runs much sooner than that. Call it explicitly from the scheduler init sequence instead. [ Impact: fix access-before-init crash ] LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 5 +---- kernel/sched.c | 5 ++++- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b9679c36bcc..fcdafa234a5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3265,15 +3265,12 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = { .notifier_call = perf_cpu_notify, }; -static int __init perf_counter_init(void) +void __init perf_counter_init(void) { perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&perf_cpu_nb); - - return 0; } -early_initcall(perf_counter_init); static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) { diff --git a/kernel/sched.c b/kernel/sched.c index 2f600e30dcf..a728976a3a6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -8996,7 +8997,7 @@ void __init sched_init(void) * 1024) and two child groups A0 and A1 (of weight 1024 each), * then A0's share of the cpu resource is: * - * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% * * We achieve this by letting init_task_group's tasks sit * directly in rq->cfs (i.e init_task_group->se[] = NULL). @@ -9097,6 +9098,8 @@ void __init sched_init(void) alloc_bootmem_cpumask_var(&cpu_isolated_map); #endif /* SMP */ + perf_counter_init(); + scheduler_running = 1; } -- cgit v1.2.3 From 1dce8d99b85aba6eddb8b8260baea944922e6fe7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2009 19:23:18 +0200 Subject: perf_counter: convert perf_resource_mutex to a spinlock Now percpu counters can be initialized very early. But the init sequence uses mutex_lock(). Fortunately, perf_resource_mutex should be a spinlock anyway, so convert it. [ Impact: fix crash due to early init mutex use ] LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index fcdafa234a5..5f86a1156c9 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -46,9 +46,9 @@ static atomic_t nr_comm_tracking __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ /* - * Mutex for (sysadmin-configurable) counter reservations: + * Lock for (sysadmin-configurable) counter reservations: */ -static DEFINE_MUTEX(perf_resource_mutex); +static DEFINE_SPINLOCK(perf_resource_lock); /* * Architecture provided APIs - weak aliases: @@ -3207,9 +3207,9 @@ static void __cpuinit perf_counter_init_cpu(int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); __perf_counter_init_context(&cpuctx->ctx, NULL); - mutex_lock(&perf_resource_mutex); + spin_lock(&perf_resource_lock); cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; - mutex_unlock(&perf_resource_mutex); + spin_unlock(&perf_resource_lock); hw_perf_counter_setup(cpu); } @@ -3292,7 +3292,7 @@ perf_set_reserve_percpu(struct sysdev_class *class, if (val > perf_max_counters) return -EINVAL; - mutex_lock(&perf_resource_mutex); + spin_lock(&perf_resource_lock); perf_reserved_percpu = val; for_each_online_cpu(cpu) { cpuctx = &per_cpu(perf_cpu_context, cpu); @@ -3302,7 +3302,7 @@ perf_set_reserve_percpu(struct sysdev_class *class, cpuctx->max_pertask = mpt; spin_unlock_irq(&cpuctx->ctx.lock); } - mutex_unlock(&perf_resource_mutex); + spin_unlock(&perf_resource_lock); return count; } @@ -3324,9 +3324,9 @@ perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) if (val > 1) return -EINVAL; - mutex_lock(&perf_resource_mutex); + spin_lock(&perf_resource_lock); perf_overcommit = val; - mutex_unlock(&perf_resource_mutex); + spin_unlock(&perf_resource_lock); return count; } -- cgit v1.2.3 From d6ce96dabe2c4409fd009ec14250a1fdbab4b133 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 May 2009 01:15:24 -0400 Subject: ring-buffer: export symbols I'm adding a module to do a series of tests on the ring buffer as well as benchmarks. This module needs to have more of the ring buffer API exported. There's nothing wrong with reading the ring buffer from a module. [ Impact: allow modules to read pages from the ring buffer ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f4cc59040eb..3e86da9b2a0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2802,6 +2802,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) return bpage; } +EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); /** * ring_buffer_free_read_page - free an allocated read page @@ -2814,6 +2815,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) { free_page((unsigned long)data); } +EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); /** * ring_buffer_read_page - extract a page from the ring buffer @@ -2959,6 +2961,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, out: return ret; } +EXPORT_SYMBOL_GPL(ring_buffer_read_page); static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, -- cgit v1.2.3 From f0d2c681ac0a85142fc8abe65fc33fcad35cb9b7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 13:43:37 -0400 Subject: ring-buffer: add counters for commit overrun and nmi dropped entries The WARN_ON in the ring buffer when a commit is preempted and the buffer is filled by preceding writes can happen in normal operations. The WARN_ON makes it look like a bug, not to mention, because it does not stop tracing and calls printk which can also recurse, this is prone to deadlock (the WARN_ON is not in a position to recurse). This patch removes the WARN_ON and replaces it with a counter that can be retrieved by a tracer. This counter is called commit_overrun. While at it, I added a nmi_dropped counter to count any time an NMI entry is dropped because the NMI could not take the spinlock. [ Impact: prevent deadlock by printing normal case warning ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 52 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3e86da9b2a0..26e1359fe19 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -402,6 +402,8 @@ struct ring_buffer_per_cpu { struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; + unsigned long nmi_dropped; + unsigned long commit_overrun; unsigned long overrun; unsigned long entries; u64 write_stamp; @@ -1216,8 +1218,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * simply fail. */ if (unlikely(in_nmi())) { - if (!__raw_spin_trylock(&cpu_buffer->lock)) + if (!__raw_spin_trylock(&cpu_buffer->lock)) { + cpu_buffer->nmi_dropped++; goto out_reset; + } } else __raw_spin_lock(&cpu_buffer->lock); @@ -1238,8 +1242,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * about it. */ if (unlikely(next_page == commit_page)) { - /* This can easily happen on small ring buffers */ - WARN_ON_ONCE(buffer->pages > 2); + cpu_buffer->commit_overrun++; goto out_reset; } @@ -1925,6 +1928,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); +/** + * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = cpu_buffer->nmi_dropped; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu); + +/** + * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long +ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = cpu_buffer->commit_overrun; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); + /** * ring_buffer_entries - get the number of entries in a buffer * @buffer: The ring buffer @@ -2595,6 +2639,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; + cpu_buffer->nmi_dropped = 0; + cpu_buffer->commit_overrun = 0; cpu_buffer->overrun = 0; cpu_buffer->entries = 0; -- cgit v1.2.3 From c8d771835e18c938dae8690611d65fe98ad30f58 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 18:03:45 -0400 Subject: tracing: export stats of ring buffers to userspace This patch adds stats to the ftrace ring buffers: # cat /debugfs/tracing/per_cpu/cpu0/stats entries: 42360 overrun: 30509326 commit overrun: 0 nmi dropped: 0 Where entries are the total number of data entries in the buffer. overrun is the number of entries not consumed and were overwritten by the writer. commit overrun is the number of entries dropped due to nested writers wrapping the buffer before the initial writer finished the commit. nmi dropped is the number of entries dropped due to the ring buffer lock being held when an nmi was going to write to the ring buffer. Note, this field will be meaningless and will go away when the ring buffer becomes lockless. [ Impact: let userspace know what is happening in the ring buffers ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f5427e0fc98..74df029056b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3595,6 +3595,45 @@ static const struct file_operations tracing_buffers_fops = { .llseek = no_llseek, }; +static ssize_t +tracing_stats_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + unsigned long cpu = (unsigned long)filp->private_data; + struct trace_array *tr = &global_trace; + struct trace_seq *s; + unsigned long cnt; + + s = kmalloc(sizeof(*s), GFP_ATOMIC); + if (!s) + return ENOMEM; + + trace_seq_init(s); + + cnt = ring_buffer_entries_cpu(tr->buffer, cpu); + trace_seq_printf(s, "entries: %ld\n", cnt); + + cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); + trace_seq_printf(s, "overrun: %ld\n", cnt); + + cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); + trace_seq_printf(s, "commit overrun: %ld\n", cnt); + + cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu); + trace_seq_printf(s, "nmi dropped: %ld\n", cnt); + + count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); + + kfree(s); + + return count; +} + +static const struct file_operations tracing_stats_fops = { + .open = tracing_open_generic, + .read = tracing_stats_read, +}; + #ifdef CONFIG_DYNAMIC_FTRACE int __weak ftrace_arch_read_dyn_info(char *buf, int size) @@ -3708,6 +3747,9 @@ static void tracing_init_debugfs_percpu(long cpu) trace_create_file("trace_pipe_raw", 0444, d_cpu, (void *) cpu, &tracing_buffers_fops); + + trace_create_file("stats", 0444, d_cpu, + (void *) cpu, &tracing_stats_fops); } #ifdef CONFIG_FTRACE_SELFTEST -- cgit v1.2.3 From 60aa605dfce2976e54fa76e805ab0f221372d4d9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:21 +0200 Subject: sched: rt: document the risk of small values in the bandwidth settings Thomas noted that we should disallow sysctl_sched_rt_runtime == 0 for (!RT_GROUP) since the root group always has some RT tasks in it. Further, update the documentation to inspire clue. [ Impact: exclude corner-case sysctl_sched_rt_runtime value ] Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra LKML-Reference: <20090505155436.863098054@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 54d67b94f1a..2a43a581ead 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9917,6 +9917,13 @@ static int sched_rt_global_constraints(void) if (sysctl_sched_rt_period <= 0) return -EINVAL; + /* + * There's always some RT tasks in the root group + * -- migration, kstopmachine etc.. + */ + if (sysctl_sched_rt_runtime == 0) + return -EBUSY; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; -- cgit v1.2.3 From c66de4a5be7913247bd83d79168f8e4420c9cfbc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:22 +0200 Subject: perf_counter: uncouple data_head updates from wakeups Keep data_head up-to-date irrespective of notifications. This fixes the case where you disable a counter and don't get a notification for the last few pending events, and it also allows polling usage. [ Impact: increase precision of perfcounter mmap-ed fields ] Suggested-by: Corey Ashford Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <20090505155436.925084300@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5f86a1156c9..ba5e921e1f3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1696,7 +1696,6 @@ struct perf_output_handle { struct perf_mmap_data *data; unsigned int offset; unsigned int head; - int wakeup; int nmi; int overflow; int locked; @@ -1752,8 +1751,7 @@ static void perf_output_unlock(struct perf_output_handle *handle) struct perf_mmap_data *data = handle->data; int head, cpu; - if (handle->wakeup) - data->wakeup_head = data->head; + data->done_head = data->head; if (!handle->locked) goto out; @@ -1764,13 +1762,11 @@ again: * before we publish the new head, matched by a rmb() in userspace when * reading this position. */ - while ((head = atomic_xchg(&data->wakeup_head, 0))) { + while ((head = atomic_xchg(&data->done_head, 0))) data->user_page->data_head = head; - handle->wakeup = 1; - } /* - * NMI can happen here, which means we can miss a wakeup_head update. + * NMI can happen here, which means we can miss a done_head update. */ cpu = atomic_xchg(&data->lock, 0); @@ -1779,7 +1775,7 @@ again: /* * Therefore we have to validate we did not indeed do so. */ - if (unlikely(atomic_read(&data->wakeup_head))) { + if (unlikely(atomic_read(&data->done_head))) { /* * Since we had it locked, we can lock it again. */ @@ -1789,7 +1785,7 @@ again: goto again; } - if (handle->wakeup) + if (atomic_xchg(&data->wakeup, 0)) perf_output_wakeup(handle); out: local_irq_restore(handle->flags); @@ -1824,7 +1820,9 @@ static int perf_output_begin(struct perf_output_handle *handle, handle->offset = offset; handle->head = head; - handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); + + if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) + atomic_set(&data->wakeup, 1); return 0; @@ -1882,7 +1880,7 @@ static void perf_output_end(struct perf_output_handle *handle) int events = atomic_inc_return(&data->events); if (events >= wakeup_events) { atomic_sub(wakeup_events, &data->events); - handle->wakeup = 1; + atomic_set(&data->wakeup, 1); } } -- cgit v1.2.3 From 6de6a7b95705b859b61430fa3afa1403034eb3e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:23 +0200 Subject: perf_counter: add ioctl(PERF_COUNTER_IOC_RESET) Provide a way to reset an existing counter - this eases PAPI libraries around perfcounters. Similar to read() it doesn't collapse pending child counters. [ Impact: new perfcounter fd ioctl method to reset counters ] Suggested-by: Corey Ashford Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <20090505155437.022272933@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ba5e921e1f3..6e6834e0587 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1288,6 +1288,11 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) return events; } +static void perf_counter_reset(struct perf_counter *counter) +{ + atomic_set(&counter->count, 0); +} + static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct perf_counter *counter = file->private_data; @@ -1303,6 +1308,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_COUNTER_IOC_REFRESH: perf_counter_refresh(counter, arg); break; + case PERF_COUNTER_IOC_RESET: + perf_counter_reset(counter); + break; default: err = -ENOTTY; } -- cgit v1.2.3 From c5078f78b455fbf67ea71442c7e7ca8acf9ff095 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:24 +0200 Subject: perf_counter: provide an mlock threshold Provide a threshold to relax the mlock accounting, increasing usability. Each counter gets perf_counter_mlock_kb for free. [ Impact: allow more mmap buffering ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090505155437.112113632@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 15 +++++++++++---- kernel/sysctl.c | 8 ++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 6e6834e0587..2d134273830 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -44,6 +44,7 @@ static atomic_t nr_munmap_tracking __read_mostly; static atomic_t nr_comm_tracking __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ +int sysctl_perf_counter_mlock __read_mostly = 128; /* 'free' kb per counter */ /* * Lock for (sysadmin-configurable) counter reservations: @@ -1461,7 +1462,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { - vma->vm_mm->locked_vm -= counter->data->nr_pages + 1; + vma->vm_mm->locked_vm -= counter->data->nr_locked; perf_mmap_data_free(counter); mutex_unlock(&counter->mmap_mutex); } @@ -1480,6 +1481,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long nr_pages; unsigned long locked, lock_limit; int ret = 0; + long extra; if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) return -EINVAL; @@ -1507,8 +1509,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - locked = vma->vm_mm->locked_vm; - locked += nr_pages + 1; + extra = nr_pages /* + 1 only account the data pages */; + extra -= sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); + if (extra < 0) + extra = 0; + + locked = vma->vm_mm->locked_vm + extra; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; lock_limit >>= PAGE_SHIFT; @@ -1524,7 +1530,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; atomic_set(&counter->mmap_count, 1); - vma->vm_mm->locked_vm += nr_pages + 1; + vma->vm_mm->locked_vm += extra; + counter->data->nr_locked = extra; unlock: mutex_unlock(&counter->mmap_mutex); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8203d70928d..3b05c2b088d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -920,6 +920,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "perf_counter_mlock_kb", + .data = &sysctl_perf_counter_mlock, + .maxlen = sizeof(sysctl_perf_counter_mlock), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif /* * NOTE: do not add new entries to this table unless you have read -- cgit v1.2.3 From 22c1558e51c210787c6cf75d8905246fc91ec030 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:25 +0200 Subject: perf_counter: fix the output lock Use -1 instead of 0 as unlocked, since 0 is a valid cpu number. ( This is not an issue right now but will be once we allow multiple counters to output to the same mmap area. ) [ Impact: prepare code for multi-counter profile output ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090505155437.232686598@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2d134273830..c881afef997 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1409,6 +1409,7 @@ static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) } data->nr_pages = nr_pages; + atomic_set(&data->lock, -1); rcu_assign_pointer(counter->data, data); @@ -1755,7 +1756,7 @@ static void perf_output_lock(struct perf_output_handle *handle) if (in_nmi() && atomic_read(&data->lock) == cpu) return; - while (atomic_cmpxchg(&data->lock, 0, cpu) != 0) + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) cpu_relax(); handle->locked = 1; @@ -1784,7 +1785,7 @@ again: * NMI can happen here, which means we can miss a done_head update. */ - cpu = atomic_xchg(&data->lock, 0); + cpu = atomic_xchg(&data->lock, -1); WARN_ON_ONCE(cpu != smp_processor_id()); /* @@ -1794,7 +1795,7 @@ again: /* * Since we had it locked, we can lock it again. */ - while (atomic_cmpxchg(&data->lock, 0, cpu) != 0) + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) cpu_relax(); goto again; -- cgit v1.2.3 From 2023b359214bbc5bad31571cf50d7fb83b535c0a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:26 +0200 Subject: perf_counter: inheritable sample counters Redirect the output to the parent counter and put in some sanity checks. [ Impact: new perfcounter feature - inherited sampling counters ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090505155437.331556171@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c881afef997..60e55f0b48f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -738,10 +738,18 @@ static void perf_counter_enable(struct perf_counter *counter) spin_unlock_irq(&ctx->lock); } -static void perf_counter_refresh(struct perf_counter *counter, int refresh) +static int perf_counter_refresh(struct perf_counter *counter, int refresh) { + /* + * not supported on inherited counters + */ + if (counter->hw_event.inherit) + return -EINVAL; + atomic_add(refresh, &counter->event_limit); perf_counter_enable(counter); + + return 0; } /* @@ -1307,7 +1315,7 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) perf_counter_disable_family(counter); break; case PERF_COUNTER_IOC_REFRESH: - perf_counter_refresh(counter, arg); + err = perf_counter_refresh(counter, arg); break; case PERF_COUNTER_IOC_RESET: perf_counter_reset(counter); @@ -1814,6 +1822,12 @@ static int perf_output_begin(struct perf_output_handle *handle, struct perf_mmap_data *data; unsigned int offset, head; + /* + * For inherited counters we send all the output towards the parent. + */ + if (counter->parent) + counter = counter->parent; + rcu_read_lock(); data = rcu_dereference(counter->data); if (!data) @@ -1995,6 +2009,9 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_ADDR) perf_output_put(&handle, addr); + /* + * XXX PERF_RECORD_GROUP vs inherited counters seems difficult. + */ if (record_type & PERF_RECORD_GROUP) { struct perf_counter *leader, *sub; u64 nr = counter->nr_siblings; @@ -2281,6 +2298,11 @@ int perf_counter_overflow(struct perf_counter *counter, int events = atomic_read(&counter->event_limit); int ret = 0; + /* + * XXX event_limit might not quite work as expected on inherited + * counters + */ + counter->pending_kill = POLL_IN; if (events && atomic_dec_and_test(&counter->event_limit)) { ret = 1; @@ -2801,6 +2823,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, pmu = NULL; + /* + * we currently do not support PERF_RECORD_GROUP on inherited counters + */ + if (hw_event->inherit && (hw_event->record_type & PERF_RECORD_GROUP)) + goto done; + if (perf_event_raw(hw_event)) { pmu = hw_perf_counter_init(counter); goto done; -- cgit v1.2.3 From e4906eff9e6fbd2d311abcbcc53d5a531773c982 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 30 Apr 2009 20:49:44 -0400 Subject: ring-buffer: convert cpu buffer entries to local_t The entries counter in cpu buffer is not atomic. It can be updated by other interrupts or from another CPU (readers). But making entries into "atomic_t" causes an atomic operation that can hurt performance. Instead we convert it to a local_t that will increment a counter with a local CPU atomic operation (if the arch supports it). Instead of fighting with readers and overwrites that decrement the counter, I added a "read" counter. Every time a reader reads an entry it is incremented. We already have a overrun counter and with that, the entries counter and the read counter, we can calculate the total number of entries in the buffer with: (entries - overrun) - read As long as the total number of entries in the ring buffer is less than the word size, this will work. But since the entries counter was previously a long, this is no different than what we had before. Thanks to Andrew Morton for pointing out in the first version that atomic_t does not replace unsigned long. I switched to atomic_long_t even though it is signed. A negative count is most likely a bug. [ Impact: keep accurate count of cpu buffer entries ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 26e1359fe19..c792ea893b0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -405,7 +405,8 @@ struct ring_buffer_per_cpu { unsigned long nmi_dropped; unsigned long commit_overrun; unsigned long overrun; - unsigned long entries; + unsigned long read; + local_t entries; u64 write_stamp; u64 read_stamp; atomic_t record_disabled; @@ -997,7 +998,6 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) continue; cpu_buffer->overrun++; - cpu_buffer->entries--; } } @@ -1588,7 +1588,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { - cpu_buffer->entries++; + local_inc(&cpu_buffer->entries); /* Only process further if we own the commit */ if (!rb_is_commit(cpu_buffer, event)) @@ -1722,7 +1722,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, * The commit is still visible by the reader, so we * must increment entries. */ - cpu_buffer->entries++; + local_inc(&cpu_buffer->entries); out: /* * If a write came in and pushed the tail page @@ -1902,7 +1902,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->entries; + ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) + - cpu_buffer->read; return ret; } @@ -1985,7 +1986,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - entries += cpu_buffer->entries; + entries += (local_read(&cpu_buffer->entries) - + cpu_buffer->overrun) - cpu_buffer->read; } return entries; @@ -2225,7 +2227,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX || rb_discarded_event(event)) - cpu_buffer->entries--; + cpu_buffer->read++; rb_update_read_stamp(cpu_buffer, event); @@ -2642,7 +2644,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->nmi_dropped = 0; cpu_buffer->commit_overrun = 0; cpu_buffer->overrun = 0; - cpu_buffer->entries = 0; + cpu_buffer->read = 0; + local_set(&cpu_buffer->entries, 0); cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; @@ -2813,7 +2816,7 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, /* Only count data entries */ if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) continue; - cpu_buffer->entries--; + cpu_buffer->read++; } __raw_spin_unlock(&cpu_buffer->lock); } -- cgit v1.2.3 From 41c51c98f588edcdf6141cff1895df738e03ddd4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 3 May 2009 23:11:18 +0200 Subject: rcu: rcu_sched_grace_period(): kill the bogus flush_signals() As a kernel thread, rcu_sched_grace_period() runs with all signals ignored. It can never receive a signal even if it sleeps in TASK_INTERRUPTIBLE, it needs the explicit allow_signal() to be visible for signals. [ Impact: reduce kernel size, remove dead code ] Signed-off-by: Oleg Nesterov Reviewed-by: Paul E. McKenney Cc: Andrew Morton LKML-Reference: <20090503211118.GA22973@redhat.com> Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index ce97a4df64d..beb0e659adc 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1356,17 +1356,11 @@ static int rcu_sched_grace_period(void *arg) rcu_ctrlblk.sched_sleep = rcu_sched_sleeping; spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); - ret = 0; + ret = 0; /* unused */ __wait_event_interruptible(rcu_ctrlblk.sched_wq, rcu_ctrlblk.sched_sleep != rcu_sched_sleeping, ret); - /* - * Signals would prevent us from sleeping, and we cannot - * do much with them in any case. So flush them. - */ - if (ret) - flush_signals(current); couldsleepnext = 0; } while (!kthread_should_stop()); -- cgit v1.2.3 From 778c55d44eb4f5f658915ed631d68ed9d1ac3ad1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 May 2009 18:44:45 -0400 Subject: ring-buffer: record page entries in buffer page descriptor Currently, when the ring buffer writer overflows the buffer and must write over non consumed data, we increment the overrun counter by reading the entries on the page we are about to overwrite. This reads the entries one by one. This is not very effecient. This patch adds another entry counter into each buffer page descriptor that keeps track of the number of entries on the page. Now on overwrite, the overrun counter simply needs to add the number of entries that is on the page it is about to overwrite. [ Impact: speed up of ring buffer in overwrite mode ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c792ea893b0..342eacc4baa 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -321,9 +321,10 @@ struct buffer_data_page { }; struct buffer_page { + struct list_head list; /* list of buffer pages */ local_t write; /* index for next write */ unsigned read; /* index for next read */ - struct list_head list; /* list of free pages */ + local_t entries; /* entries on this page */ struct buffer_data_page *page; /* Actual data page */ }; @@ -977,30 +978,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) return rb_page_commit(cpu_buffer->head_page); } -/* - * When the tail hits the head and the buffer is in overwrite mode, - * the head jumps to the next page and all content on the previous - * page is discarded. But before doing so, we update the overrun - * variable of the buffer. - */ -static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) -{ - struct ring_buffer_event *event; - unsigned long head; - - for (head = 0; head < rb_head_size(cpu_buffer); - head += rb_event_length(event)) { - - event = __rb_page_index(cpu_buffer->head_page, head); - if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) - return; - /* Only count data entries */ - if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) - continue; - cpu_buffer->overrun++; - } -} - static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page **bpage) { @@ -1253,7 +1230,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* tail_page has not moved yet? */ if (tail_page == cpu_buffer->tail_page) { /* count overflows */ - rb_update_overflow(cpu_buffer); + cpu_buffer->overrun += + local_read(&head_page->entries); rb_inc_page(cpu_buffer, &head_page); cpu_buffer->head_page = head_page; @@ -1268,6 +1246,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, */ if (tail_page == cpu_buffer->tail_page) { local_set(&next_page->write, 0); + local_set(&next_page->entries, 0); local_set(&next_page->page->commit, 0); cpu_buffer->tail_page = next_page; @@ -1313,6 +1292,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); rb_update_event(event, type, length); + /* The passed in type is zero for DATA */ + if (likely(!type)) + local_inc(&tail_page->entries); + /* * If this is a commit and the tail is zero, then update * this page's time stamp. @@ -2183,6 +2166,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->list.prev = reader->list.prev; local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); /* Make the reader page now replace the head */ @@ -2629,6 +2613,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->head_page = list_entry(cpu_buffer->pages.next, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); + local_set(&cpu_buffer->head_page->entries, 0); local_set(&cpu_buffer->head_page->page->commit, 0); cpu_buffer->head_page->read = 0; @@ -2638,6 +2623,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) INIT_LIST_HEAD(&cpu_buffer->reader_page->list); local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; @@ -2996,6 +2982,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, bpage = reader->page; reader->page = *data_page; local_set(&reader->write, 0); + local_set(&reader->entries, 0); reader->read = 0; *data_page = bpage; -- cgit v1.2.3 From afbab76a62b69ea6197e19727d4b8a8aef8deb25 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 May 2009 19:40:05 -0400 Subject: ring-buffer: have read page swap increment counter with page entries In the swap page ring buffer code that is used by the ftrace splice code, we scan the page to increment the counter of entries read. With the number of entries already in the page we simply need to add it. [ Impact: speed up reading page from ring buffer ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 342eacc4baa..9e42a742a3f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2785,28 +2785,6 @@ out: } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); -static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_data_page *bpage, - unsigned int offset) -{ - struct ring_buffer_event *event; - unsigned long head; - - __raw_spin_lock(&cpu_buffer->lock); - for (head = offset; head < local_read(&bpage->commit); - head += rb_event_length(event)) { - - event = __rb_data_page_index(bpage, head); - if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) - return; - /* Only count data entries */ - if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) - continue; - cpu_buffer->read++; - } - __raw_spin_unlock(&cpu_buffer->lock); -} - /** * ring_buffer_alloc_read_page - allocate a page to read from buffer * @buffer: the buffer to allocate for. @@ -2977,6 +2955,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer, /* we copied everything to the beginning */ read = 0; } else { + /* update the entry counter */ + cpu_buffer->read += local_read(&reader->entries); + /* swap the pages */ rb_init_page(bpage); bpage = reader->page; @@ -2985,9 +2966,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer, local_set(&reader->entries, 0); reader->read = 0; *data_page = bpage; - - /* update the entry counter */ - rb_remove_entries(cpu_buffer, bpage, read); } ret = read; -- cgit v1.2.3 From 41ede23eded40832c955d98d4b71bc244809abb3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 May 2009 20:26:54 -0400 Subject: ring-buffer: disable writers when resetting buffers As a precaution, it is best to disable writing to the ring buffers when reseting them. [ Impact: prevent weird things if write happens during reset ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9e42a742a3f..7876df00695 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2650,6 +2650,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; + atomic_inc(&cpu_buffer->record_disabled); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); __raw_spin_lock(&cpu_buffer->lock); @@ -2659,6 +2661,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + atomic_dec(&cpu_buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); -- cgit v1.2.3 From 94487d6d53af5acae10cf9fd52f74498994d46b1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 May 2009 19:22:53 -0400 Subject: tracing: use proper export symbol for tracing api When adding the EXPORT_SYMBOL to some of the tracing API, I accidently used EXPORT_SYMBOL instead of EXPORT_SYMBOL_GPL. This patch fixes that mistake. [ Impact: export the tracing code only for GPL modules ] Reported-by: Christoph Hellwig Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 74df029056b..4164a344e72 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -887,21 +887,21 @@ trace_current_buffer_lock_reserve(int type, unsigned long len, return trace_buffer_lock_reserve(&global_trace, type, len, flags, pc); } -EXPORT_SYMBOL(trace_current_buffer_lock_reserve); +EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); } -EXPORT_SYMBOL(trace_current_buffer_unlock_commit); +EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); } -EXPORT_SYMBOL(trace_nowake_buffer_unlock_commit); +EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); void trace_current_buffer_discard_commit(struct ring_buffer_event *event) { -- cgit v1.2.3 From aa20ae8444fc6c318272c643f856d8d8ad3e198d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 May 2009 21:16:11 -0400 Subject: ring-buffer: move big if statement down In the hot path of the ring buffer "__rb_reserve_next" there's a big if statement that does not even return back to the work flow. code; if (cross to next page) { [ lots of code ] return; } more code; The condition is even the unlikely path, although we do not denote it with an unlikely because gcc is fine with it. The condition is true when the write crosses a page boundary, and we need to start at a new page. Having this if statement makes it hard to read, but calling another function to do the work is also not appropriate, because we are using a lot of variables that were set before the if statement, and we do not want to send them as parameters. This patch changes it to a goto: code; if (cross to next page) goto next_page; more code; return; next_page: [ lots of code] This makes the code easier to understand, and a bit more obvious. The output from gcc is practically identical. For some reason, gcc decided to use different registers when I switched it to a goto. But other than that, the logic is the same. [ Impact: easier to read code ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 218 +++++++++++++++++++++++---------------------- 1 file changed, 111 insertions(+), 107 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7876df00695..424129eb20a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1159,6 +1159,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, unsigned type, unsigned long length, u64 *ts) { struct buffer_page *tail_page, *head_page, *reader_page, *commit_page; + struct buffer_page *next_page; unsigned long tail, write; struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; @@ -1173,137 +1174,140 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, tail = write - length; /* See if we shot pass the end of this buffer page */ - if (write > BUF_PAGE_SIZE) { - struct buffer_page *next_page = tail_page; + if (write > BUF_PAGE_SIZE) + goto next_page; - local_irq_save(flags); - /* - * Since the write to the buffer is still not - * fully lockless, we must be careful with NMIs. - * The locks in the writers are taken when a write - * crosses to a new page. The locks protect against - * races with the readers (this will soon be fixed - * with a lockless solution). - * - * Because we can not protect against NMIs, and we - * want to keep traces reentrant, we need to manage - * what happens when we are in an NMI. - * - * NMIs can happen after we take the lock. - * If we are in an NMI, only take the lock - * if it is not already taken. Otherwise - * simply fail. - */ - if (unlikely(in_nmi())) { - if (!__raw_spin_trylock(&cpu_buffer->lock)) { - cpu_buffer->nmi_dropped++; - goto out_reset; - } - } else - __raw_spin_lock(&cpu_buffer->lock); - - lock_taken = true; + /* We reserved something on the buffer */ - rb_inc_page(cpu_buffer, &next_page); + if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) + return NULL; - head_page = cpu_buffer->head_page; - reader_page = cpu_buffer->reader_page; + event = __rb_page_index(tail_page, tail); + rb_update_event(event, type, length); - /* we grabbed the lock before incrementing */ - if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) - goto out_reset; + /* The passed in type is zero for DATA */ + if (likely(!type)) + local_inc(&tail_page->entries); - /* - * If for some reason, we had an interrupt storm that made - * it all the way around the buffer, bail, and warn - * about it. - */ - if (unlikely(next_page == commit_page)) { - cpu_buffer->commit_overrun++; - goto out_reset; - } + /* + * If this is a commit and the tail is zero, then update + * this page's time stamp. + */ + if (!tail && rb_is_commit(cpu_buffer, event)) + cpu_buffer->commit_page->page->time_stamp = *ts; - if (next_page == head_page) { - if (!(buffer->flags & RB_FL_OVERWRITE)) - goto out_reset; + return event; - /* tail_page has not moved yet? */ - if (tail_page == cpu_buffer->tail_page) { - /* count overflows */ - cpu_buffer->overrun += - local_read(&head_page->entries); + next_page: - rb_inc_page(cpu_buffer, &head_page); - cpu_buffer->head_page = head_page; - cpu_buffer->head_page->read = 0; - } - } + next_page = tail_page; - /* - * If the tail page is still the same as what we think - * it is, then it is up to us to update the tail - * pointer. - */ - if (tail_page == cpu_buffer->tail_page) { - local_set(&next_page->write, 0); - local_set(&next_page->entries, 0); - local_set(&next_page->page->commit, 0); - cpu_buffer->tail_page = next_page; - - /* reread the time stamp */ - *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); - cpu_buffer->tail_page->page->time_stamp = *ts; + local_irq_save(flags); + /* + * Since the write to the buffer is still not + * fully lockless, we must be careful with NMIs. + * The locks in the writers are taken when a write + * crosses to a new page. The locks protect against + * races with the readers (this will soon be fixed + * with a lockless solution). + * + * Because we can not protect against NMIs, and we + * want to keep traces reentrant, we need to manage + * what happens when we are in an NMI. + * + * NMIs can happen after we take the lock. + * If we are in an NMI, only take the lock + * if it is not already taken. Otherwise + * simply fail. + */ + if (unlikely(in_nmi())) { + if (!__raw_spin_trylock(&cpu_buffer->lock)) { + cpu_buffer->nmi_dropped++; + goto out_reset; } + } else + __raw_spin_lock(&cpu_buffer->lock); - /* - * The actual tail page has moved forward. - */ - if (tail < BUF_PAGE_SIZE) { - /* Mark the rest of the page with padding */ - event = __rb_page_index(tail_page, tail); - rb_event_set_padding(event); - } + lock_taken = true; - if (tail <= BUF_PAGE_SIZE) - /* Set the write back to the previous setting */ - local_set(&tail_page->write, tail); + rb_inc_page(cpu_buffer, &next_page); - /* - * If this was a commit entry that failed, - * increment that too - */ - if (tail_page == cpu_buffer->commit_page && - tail == rb_commit_index(cpu_buffer)) { - rb_set_commit_to_write(cpu_buffer); - } + head_page = cpu_buffer->head_page; + reader_page = cpu_buffer->reader_page; - __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); + /* we grabbed the lock before incrementing */ + if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) + goto out_reset; - /* fail and let the caller try again */ - return ERR_PTR(-EAGAIN); + /* + * If for some reason, we had an interrupt storm that made + * it all the way around the buffer, bail, and warn + * about it. + */ + if (unlikely(next_page == commit_page)) { + cpu_buffer->commit_overrun++; + goto out_reset; } - /* We reserved something on the buffer */ + if (next_page == head_page) { + if (!(buffer->flags & RB_FL_OVERWRITE)) + goto out_reset; - if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) - return NULL; + /* tail_page has not moved yet? */ + if (tail_page == cpu_buffer->tail_page) { + /* count overflows */ + cpu_buffer->overrun += + local_read(&head_page->entries); - event = __rb_page_index(tail_page, tail); - rb_update_event(event, type, length); + rb_inc_page(cpu_buffer, &head_page); + cpu_buffer->head_page = head_page; + cpu_buffer->head_page->read = 0; + } + } - /* The passed in type is zero for DATA */ - if (likely(!type)) - local_inc(&tail_page->entries); + /* + * If the tail page is still the same as what we think + * it is, then it is up to us to update the tail + * pointer. + */ + if (tail_page == cpu_buffer->tail_page) { + local_set(&next_page->write, 0); + local_set(&next_page->entries, 0); + local_set(&next_page->page->commit, 0); + cpu_buffer->tail_page = next_page; + + /* reread the time stamp */ + *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); + cpu_buffer->tail_page->page->time_stamp = *ts; + } /* - * If this is a commit and the tail is zero, then update - * this page's time stamp. + * The actual tail page has moved forward. */ - if (!tail && rb_is_commit(cpu_buffer, event)) - cpu_buffer->commit_page->page->time_stamp = *ts; + if (tail < BUF_PAGE_SIZE) { + /* Mark the rest of the page with padding */ + event = __rb_page_index(tail_page, tail); + rb_event_set_padding(event); + } - return event; + if (tail <= BUF_PAGE_SIZE) + /* Set the write back to the previous setting */ + local_set(&tail_page->write, tail); + + /* + * If this was a commit entry that failed, + * increment that too + */ + if (tail_page == cpu_buffer->commit_page && + tail == rb_commit_index(cpu_buffer)) { + rb_set_commit_to_write(cpu_buffer); + } + + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + /* fail and let the caller try again */ + return ERR_PTR(-EAGAIN); out_reset: /* reset write */ -- cgit v1.2.3 From 5092dbc96f3acdac5433b27c06860352dc6d23b9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 May 2009 22:47:18 -0400 Subject: ring-buffer: add benchmark and tester This patch adds code that can benchmark the ring buffer as well as test it. This code can be compiled into the kernel (not recommended) or as a module. A separate ring buffer is used to not interfer with other users, like ftrace. It creates a producer and a consumer (option to disable creation of the consumer) and will run for 10 seconds, then sleep for 10 seconds and then repeat. While running, the producer will write 10 byte loads into the ring buffer with just putting in the current CPU number. The reader will continually try to read the buffer. The reader will alternate from reading the buffer via event by event, or by full pages. The output is a pr_info, thus it will fill up the syslogs. Starting ring buffer hammer End ring buffer hammer Time: 9000349 (usecs) Overruns: 12578640 Read: 5358440 (by events) Entries: 0 Total: 17937080 Missed: 0 Hit: 17937080 Entries per millisec: 1993 501 ns per entry Sleeping for 10 secs Starting ring buffer hammer End ring buffer hammer Time: 9936350 (usecs) Overruns: 0 Read: 28146644 (by pages) Entries: 74 Total: 28146718 Missed: 0 Hit: 28146718 Entries per millisec: 2832 353 ns per entry Sleeping for 10 secs Time: is the time the test ran Overruns: the number of events that were overwritten and not read Read: the number of events read (either by pages or events) Entries: the number of entries left in the buffer (the by pages will only read full pages) Total: Entries + Read + Overruns Missed: the number of entries that failed to write Hit: the number of entries that were written The above example shows that it takes ~353 nanosecs per entry when there is a reader, reading by pages (and no overruns) The event by event reader slowed the producer down to 501 nanosecs. [ Impact: see how changes to the ring buffer affect stability and performance ] Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 16 ++ kernel/trace/Makefile | 1 + kernel/trace/ring_buffer_benchmark.c | 379 +++++++++++++++++++++++++++++++++++ 3 files changed, 396 insertions(+) create mode 100644 kernel/trace/ring_buffer_benchmark.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 450d3c2cfbd..50f62a296e1 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -471,6 +471,22 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. +config RING_BUFFER_BENCHMARK + tristate "Ring buffer benchmark stress tester" + depends on RING_BUFFER + help + This option creates a test to stress the ring buffer and bench mark it. + It creates its own ring buffer such that it will not interfer with + any other users of the ring buffer (such as ftrace). It then creates + a producer and consumer that will run for 10 seconds and sleep for + 10 seconds. Each interval it will print out the number of events + it recorded and give a rough estimate of how long each iteration took. + + It does not disable interrupts or raise its priority, so it may be + affected by processes that are running. + + If unsure, say N + endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index fb9d7f96489..7c34cbfff96 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -17,6 +17,7 @@ endif obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o +obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_clock.o diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c new file mode 100644 index 00000000000..747244acb8f --- /dev/null +++ b/kernel/trace/ring_buffer_benchmark.c @@ -0,0 +1,379 @@ +/* + * ring buffer tester and benchmark + * + * Copyright (C) 2009 Steven Rostedt + */ +#include +#include +#include +#include +#include + +struct rb_page { + u64 ts; + local_t commit; + char data[4080]; +}; + +/* run time and sleep time in seconds */ +#define RUN_TIME 10 +#define SLEEP_TIME 10 + +/* number of events for writer to wake up the reader */ +static int wakeup_interval = 100; + +static int reader_finish; +static struct completion read_start; +static struct completion read_done; + +static struct ring_buffer *buffer; +static struct task_struct *producer; +static struct task_struct *consumer; +static unsigned long read; + +static int disable_reader; +module_param(disable_reader, uint, 0644); +MODULE_PARM_DESC(disable_reader, "only run producer"); + +static int read_events; + +static int kill_test; + +#define KILL_TEST() \ + do { \ + if (!kill_test) { \ + kill_test = 1; \ + WARN_ON(1); \ + } \ + } while (0) + +enum event_status { + EVENT_FOUND, + EVENT_DROPPED, +}; + +static enum event_status read_event(int cpu) +{ + struct ring_buffer_event *event; + int *entry; + u64 ts; + + event = ring_buffer_consume(buffer, cpu, &ts); + if (!event) + return EVENT_DROPPED; + + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + return EVENT_DROPPED; + } + + read++; + return EVENT_FOUND; +} + +static enum event_status read_page(int cpu) +{ + struct ring_buffer_event *event; + struct rb_page *rpage; + unsigned long commit; + void *bpage; + int *entry; + int ret; + int inc; + int i; + + bpage = ring_buffer_alloc_read_page(buffer); + ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); + if (ret >= 0) { + rpage = bpage; + commit = local_read(&rpage->commit); + for (i = 0; i < commit && !kill_test; i += inc) { + + if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { + KILL_TEST(); + break; + } + + inc = -1; + event = (void *)&rpage->data[i]; + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + /* We don't expect any padding */ + KILL_TEST(); + break; + case RINGBUF_TYPE_TIME_EXTEND: + inc = 8; + break; + case 0: + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + break; + } + read++; + if (!event->array[0]) { + KILL_TEST(); + break; + } + inc = event->array[0]; + break; + default: + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + break; + } + read++; + inc = ((event->type_len + 1) * 4); + } + if (kill_test) + break; + + if (inc <= 0) { + KILL_TEST(); + break; + } + } + } + ring_buffer_free_read_page(buffer, bpage); + + if (ret < 0) + return EVENT_DROPPED; + return EVENT_FOUND; +} + +static void ring_buffer_consumer(void) +{ + /* toggle between reading pages and events */ + read_events ^= 1; + + read = 0; + while (!reader_finish && !kill_test) { + int found; + + do { + int cpu; + + found = 0; + for_each_online_cpu(cpu) { + enum event_status stat; + + if (read_events) + stat = read_event(cpu); + else + stat = read_page(cpu); + + if (kill_test) + break; + if (stat == EVENT_FOUND) + found = 1; + } + } while (found && !kill_test); + + set_current_state(TASK_INTERRUPTIBLE); + if (reader_finish) + break; + + schedule(); + __set_current_state(TASK_RUNNING); + } + reader_finish = 0; + complete(&read_done); +} + +static void ring_buffer_producer(void) +{ + struct timeval start_tv; + struct timeval end_tv; + unsigned long long time; + unsigned long long entries; + unsigned long long overruns; + unsigned long missed = 0; + unsigned long hit = 0; + unsigned long avg; + int cnt = 0; + + /* + * Hammer the buffer for 10 secs (this may + * make the system stall) + */ + pr_info("Starting ring buffer hammer\n"); + do_gettimeofday(&start_tv); + do { + struct ring_buffer_event *event; + int *entry; + + event = ring_buffer_lock_reserve(buffer, 10); + if (!event) { + missed++; + } else { + hit++; + entry = ring_buffer_event_data(event); + *entry = smp_processor_id(); + ring_buffer_unlock_commit(buffer, event); + } + do_gettimeofday(&end_tv); + + if (consumer && !(++cnt % wakeup_interval)) + wake_up_process(consumer); + + } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); + pr_info("End ring buffer hammer\n"); + + if (consumer) { + /* Init both completions here to avoid races */ + init_completion(&read_start); + init_completion(&read_done); + /* the completions must be visible before the finish var */ + smp_wmb(); + reader_finish = 1; + /* finish var visible before waking up the consumer */ + smp_wmb(); + wake_up_process(consumer); + wait_for_completion(&read_done); + } + + time = end_tv.tv_sec - start_tv.tv_sec; + time *= 1000000; + time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec); + + entries = ring_buffer_entries(buffer); + overruns = ring_buffer_overruns(buffer); + + if (kill_test) + pr_info("ERROR!\n"); + pr_info("Time: %lld (usecs)\n", time); + pr_info("Overruns: %lld\n", overruns); + if (disable_reader) + pr_info("Read: (reader disabled)\n"); + else + pr_info("Read: %ld (by %s)\n", read, + read_events ? "events" : "pages"); + pr_info("Entries: %lld\n", entries); + pr_info("Total: %lld\n", entries + overruns + read); + pr_info("Missed: %ld\n", missed); + pr_info("Hit: %ld\n", hit); + + do_div(time, 1000); + if (time) + hit /= (long)time; + else + pr_info("TIME IS ZERO??\n"); + + pr_info("Entries per millisec: %ld\n", hit); + + if (hit) { + avg = 1000000 / hit; + pr_info("%ld ns per entry\n", avg); + } +} + +static void wait_to_die(void) +{ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); +} + +static int ring_buffer_consumer_thread(void *arg) +{ + while (!kthread_should_stop() && !kill_test) { + complete(&read_start); + + ring_buffer_consumer(); + + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop() || kill_test) + break; + + schedule(); + __set_current_state(TASK_RUNNING); + } + __set_current_state(TASK_RUNNING); + + if (kill_test) + wait_to_die(); + + return 0; +} + +static int ring_buffer_producer_thread(void *arg) +{ + init_completion(&read_start); + + while (!kthread_should_stop() && !kill_test) { + ring_buffer_reset(buffer); + + if (consumer) { + smp_wmb(); + wake_up_process(consumer); + wait_for_completion(&read_start); + } + + ring_buffer_producer(); + + pr_info("Sleeping for 10 secs\n"); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ * SLEEP_TIME); + __set_current_state(TASK_RUNNING); + } + + if (kill_test) + wait_to_die(); + + return 0; +} + +static int __init ring_buffer_benchmark_init(void) +{ + int ret; + + /* make a one meg buffer in overwite mode */ + buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE); + if (!buffer) + return -ENOMEM; + + if (!disable_reader) { + consumer = kthread_create(ring_buffer_consumer_thread, + NULL, "rb_consumer"); + ret = PTR_ERR(consumer); + if (IS_ERR(consumer)) + goto out_fail; + } + + producer = kthread_run(ring_buffer_producer_thread, + NULL, "rb_producer"); + ret = PTR_ERR(producer); + + if (IS_ERR(producer)) + goto out_kill; + + return 0; + + out_kill: + if (consumer) + kthread_stop(consumer); + + out_fail: + ring_buffer_free(buffer); + return ret; +} + +static void __exit ring_buffer_benchmark_exit(void) +{ + kthread_stop(producer); + if (consumer) + kthread_stop(consumer); + ring_buffer_free(buffer); +} + +module_init(ring_buffer_benchmark_init); +module_exit(ring_buffer_benchmark_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("ring_buffer_benchmark"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 2df75e415709ad12862028916c772c1f377f6a7c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 6 May 2009 10:33:04 +0800 Subject: tracing/events: fix memory leak when unloading module When unloading a module, memory allocated by init_preds() and trace_define_field() is not freed. [ Impact: fix memory leak ] Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <4A00F6E0.3040503@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 18 ++++++++++++++++++ kernel/trace/trace_events_filter.c | 22 +++++++++++++++------- 2 files changed, 33 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f789ca540fe..f251a150e75 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -60,6 +60,22 @@ err: } EXPORT_SYMBOL_GPL(trace_define_field); +#ifdef CONFIG_MODULES + +static void trace_destroy_fields(struct ftrace_event_call *call) +{ + struct ftrace_event_field *field, *next; + + list_for_each_entry_safe(field, next, &call->fields, link) { + list_del(&field->link); + kfree(field->type); + kfree(field->name); + kfree(field); + } +} + +#endif /* CONFIG_MODULES */ + static void ftrace_clear_events(void) { struct ftrace_event_call *call; @@ -925,6 +941,8 @@ static void trace_module_remove_events(struct module *mod) unregister_ftrace_event(call->event); debugfs_remove_recursive(call->dir); list_del(&call->list); + trace_destroy_fields(call); + destroy_preds(call); } } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f49486687ee..ce07b818671 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -346,6 +346,20 @@ static void filter_disable_preds(struct ftrace_event_call *call) filter->preds[i]->fn = filter_pred_none; } +void destroy_preds(struct ftrace_event_call *call) +{ + struct event_filter *filter = call->filter; + int i; + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (filter->preds[i]) + filter_free_pred(filter->preds[i]); + } + kfree(filter->preds); + kfree(filter); + call->filter = NULL; +} + int init_preds(struct ftrace_event_call *call) { struct event_filter *filter; @@ -374,13 +388,7 @@ int init_preds(struct ftrace_event_call *call) return 0; oom: - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (filter->preds[i]) - filter_free_pred(filter->preds[i]); - } - kfree(filter->preds); - kfree(call->filter); - call->filter = NULL; + destroy_preds(call); return -ENOMEM; } -- cgit v1.2.3 From 20c8928abe70e204bd077ab6cfe23002d7788983 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 6 May 2009 10:33:45 +0800 Subject: tracing/events: fix concurrent access to ftrace_events list A module will add/remove its trace events when it gets loaded/unloaded, so the ftrace_events list is not "const", and concurrent access needs to be protected. This patch thus fixes races between loading/unloding modules and read 'available_events' or read/write 'set_event', etc. Below shows how to reproduce the race: # for ((; ;)) { cat /mnt/tracing/available_events; } > /dev/null & # for ((; ;)) { insmod trace-events-sample.ko; rmmod sample; } & After a while: BUG: unable to handle kernel paging request at 0010011c IP: [] t_next+0x1b/0x2d ... Call Trace: [] ? seq_read+0x217/0x30d [] ? seq_read+0x0/0x30d [] ? vfs_read+0x8f/0x136 [] ? sys_read+0x40/0x65 [] ? sysenter_do_call+0x12/0x36 [ Impact: fix races when concurrent accessing ftrace_events list ] Signed-off-by: Li Zefan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker Cc: Tom Zanussi Cc: Peter Zijlstra LKML-Reference: <4A00F709.3080800@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 1 + kernel/trace/trace_event_profile.c | 19 ++++++++++++++----- kernel/trace/trace_events.c | 20 +++++++++++--------- kernel/trace/trace_events_filter.c | 10 +++++++--- 4 files changed, 33 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7736fe8c1b7..777c6c3a0cd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -825,6 +825,7 @@ static int filter_pred_##size(struct filter_pred *pred, void *event, \ return match; \ } +extern struct mutex event_mutex; extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 7bf2ad65eee..5b5895afecf 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -10,21 +10,30 @@ int ftrace_profile_enable(int event_id) { struct ftrace_event_call *event; + int ret = -EINVAL; + mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) - return event->profile_enable(event); + if (event->id == event_id) { + ret = event->profile_enable(event); + break; + } } + mutex_unlock(&event_mutex); - return -EINVAL; + return ret; } void ftrace_profile_disable(int event_id) { struct ftrace_event_call *event; + mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) - return event->profile_disable(event); + if (event->id == event_id) { + event->profile_disable(event); + break; + } } + mutex_unlock(&event_mutex); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f251a150e75..8d579ff2361 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -21,7 +21,7 @@ #define TRACE_SYSTEM "TRACE_SYSTEM" -static DEFINE_MUTEX(event_mutex); +DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); @@ -80,6 +80,7 @@ static void ftrace_clear_events(void) { struct ftrace_event_call *call; + mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { if (call->enabled) { @@ -87,6 +88,7 @@ static void ftrace_clear_events(void) call->unregfunc(); } } + mutex_unlock(&event_mutex); } static void ftrace_event_enable_disable(struct ftrace_event_call *call, @@ -274,6 +276,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { + mutex_lock(&event_mutex); + if (*pos == 0) + m->private = ftrace_events.next; return t_next(m, NULL, pos); } @@ -303,6 +308,9 @@ s_next(struct seq_file *m, void *v, loff_t *pos) static void *s_start(struct seq_file *m, loff_t *pos) { + mutex_lock(&event_mutex); + if (*pos == 0) + m->private = ftrace_events.next; return s_next(m, NULL, pos); } @@ -319,12 +327,12 @@ static int t_show(struct seq_file *m, void *v) static void t_stop(struct seq_file *m, void *p) { + mutex_unlock(&event_mutex); } static int ftrace_event_seq_open(struct inode *inode, struct file *file) { - int ret; const struct seq_operations *seq_ops; if ((file->f_mode & FMODE_WRITE) && @@ -332,13 +340,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file) ftrace_clear_events(); seq_ops = inode->i_private; - ret = seq_open(file, seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = ftrace_events.next; - } - return ret; + return seq_open(file, seq_ops); } static ssize_t diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index ce07b818671..7ac69108527 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -408,6 +408,7 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) filter->n_preds = 0; } + mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) continue; @@ -417,6 +418,7 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) remove_filter_string(call->filter); } } + mutex_unlock(&event_mutex); } static int filter_add_pred_fn(struct filter_parse_state *ps, @@ -567,6 +569,7 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, { struct event_filter *filter = system->filter; struct ftrace_event_call *call; + int err = 0; if (!filter->preds) { filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), @@ -584,8 +587,8 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, filter->preds[filter->n_preds] = pred; filter->n_preds++; + mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - int err; if (!call->define_fields) continue; @@ -597,12 +600,13 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, if (err) { filter_free_subsystem_preds(system); parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - return err; + break; } replace_filter_string(call->filter, filter_string); } + mutex_unlock(&event_mutex); - return 0; + return err; } static void parse_init(struct filter_parse_state *ps, -- cgit v1.2.3 From de1d7286060430e79a1d50ad6e5fee8fe863c5f6 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 5 May 2009 16:49:59 +0800 Subject: tracepoint: trace_sched_migrate_task(): remove parameter The orig_cpu parameter in trace_sched_migrate_task() is not necessary, it can be got by using task_cpu(p) in the probe. [ Impact: micro-optimization ] Signed-off-by: Mathieu Desnoyers [ modified from Mathieu's patch. The original patch is at: http://marc.info/?l=linux-kernel&m=123791201716239&w=2 ] Signed-off-by: Xiao Guangrong Cc: fweisbec@gmail.com Cc: rostedt@goodmis.org Cc: Li Zefan Cc: zhaolei@cn.fujitsu.com Cc: laijs@cn.fujitsu.com LKML-Reference: <49FFFDB7.1050402@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9f7ffd00b6e..9cdedbd181c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1954,7 +1954,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) clock_offset = old_rq->clock - new_rq->clock; - trace_sched_migrate_task(p, task_cpu(p), new_cpu); + trace_sched_migrate_task(p, new_cpu); #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) -- cgit v1.2.3 From a42aaa3bbce85ac487ad4fad5db99e8e91b7aac1 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Mon, 4 May 2009 16:27:26 -0400 Subject: blktrace: correct remap names This attempts to clarify names utilized during block I/O remap operations (partition, volume manager). It correctly matches up the /from/ information for both device & sector. This takes in the concept from Kosaki Motohiro and extends it to include better naming for the "device_from" field. [ Impact: cleanup ] Signed-off-by: Alan D. Brunelle Reviewed-by: Li Zefan Reviewed-by: KOSAKI Motohiro Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo LKML-Reference: <49FF4FAE.3000301@hp.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c32062bd10b..f8d46d6f5d3 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -830,8 +830,8 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * @q: queue the io is for * @bio: the source bio * @dev: target device - * @from: source sector * @to: target sector + * @from: source sector * * Description: * Device mapper or raid target sometimes need to split a bio because @@ -839,7 +839,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * **/ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from, sector_t to) + dev_t dev, sector_t to, sector_t from) { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; @@ -847,9 +847,9 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, if (likely(!bt)) return; - r.device = cpu_to_be32(dev); - r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); - r.sector = cpu_to_be64(to); + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); + r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); @@ -1028,11 +1028,11 @@ static void get_pdu_remap(const struct trace_entry *ent, struct blk_io_trace_remap *r) { const struct blk_io_trace_remap *__r = pdu_start(ent); - __u64 sector = __r->sector; + __u64 sector_from = __r->sector_from; - r->device = be32_to_cpu(__r->device); r->device_from = be32_to_cpu(__r->device_from); - r->sector = be64_to_cpu(sector); + r->device_to = be32_to_cpu(__r->device_to); + r->sector_from = be64_to_cpu(sector_from); } typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); @@ -1148,13 +1148,13 @@ static int blk_log_with_error(struct trace_seq *s, static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) { - struct blk_io_trace_remap r = { .device = 0, }; + struct blk_io_trace_remap r = { .device_from = 0, }; get_pdu_remap(ent, &r); return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", - t_sector(ent), - t_sec(ent), MAJOR(r.device), MINOR(r.device), - (unsigned long long)r.sector); + t_sector(ent), t_sec(ent), + MAJOR(r.device_from), MINOR(r.device_from), + (unsigned long long)r.sector_from); } static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) -- cgit v1.2.3 From 22a7c31a9659deaddafbbcec6562d44141e84474 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Mon, 4 May 2009 16:35:08 -0400 Subject: blktrace: from-sector redundant in trace_block_remap Remove redundant from-sector parameter: it's /always/ the bio's sector passed in. [ Impact: cleanup ] Signed-off-by: Alan D. Brunelle Reviewed-by: Li Zefan Reviewed-by: KOSAKI Motohiro Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo LKML-Reference: <49FF517C.7000503@hp.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f8d46d6f5d3..e099f8cc1d1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -830,7 +830,6 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * @q: queue the io is for * @bio: the source bio * @dev: target device - * @to: target sector * @from: source sector * * Description: @@ -839,7 +838,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * **/ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t to, sector_t from) + dev_t dev, sector_t from) { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; @@ -851,8 +850,9 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); r.sector_from = cpu_to_be64(from); - __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, - !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, + BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), + sizeof(r), &r); } /** -- cgit v1.2.3 From 48dd0fed90e2b1f1ba87401439b85942181c6df3 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 6 May 2009 15:45:45 +0530 Subject: tracing: trace_output.c, fix false positive compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This compiler warning: CC kernel/trace/trace_output.o kernel/trace/trace_output.c: In function ‘register_ftrace_event’: kernel/trace/trace_output.c:544: warning: ‘list’ may be used uninitialized in this function Is wrong as 'list' is always initialized - but GCC (4.3.2) does not recognize this relationship properly. Work around the warning by initializing the variable to NULL. [ Impact: fix false positive compiler warning ] Signed-off-by: Jaswinder Singh Rajput Acked-by: Steven Rostedt LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 5fc51f0f75f..8bd9a2c1a46 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -541,7 +541,7 @@ int register_ftrace_event(struct trace_event *event) INIT_LIST_HEAD(&event->list); if (!event->type) { - struct list_head *list; + struct list_head *list = NULL; if (next_event_type > FTRACE_MAX_EVENT) { -- cgit v1.2.3 From 8e7abf1c62941ebb7a1416cbc62392c8a0902625 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 10:26:45 -0400 Subject: ring-buffer: remove unneeded conditional in rb_reserve_next The code in __rb_reserve_next checks on page overflow if it is the original commiter and then resets the page back to the original setting. Although this is fine, and the code is correct, it is a bit fragil. Some experimental work I did breaks it easily. The better and more robust solution is to have all commiters that overflow the page, simply subtract what they added. [ Impact: more robust ring buffer account management ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 424129eb20a..03ed52b67db 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1290,9 +1290,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, rb_event_set_padding(event); } - if (tail <= BUF_PAGE_SIZE) - /* Set the write back to the previous setting */ - local_set(&tail_page->write, tail); + /* Set the write back to the previous setting */ + local_sub(length, &tail_page->write); /* * If this was a commit entry that failed, @@ -1311,8 +1310,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, out_reset: /* reset write */ - if (tail <= BUF_PAGE_SIZE) - local_set(&tail_page->write, tail); + local_sub(length, &tail_page->write); if (likely(lock_taken)) __raw_spin_unlock(&cpu_buffer->lock); -- cgit v1.2.3 From 00c81a58c5b4e0de14ee33bfbc3d71c90f69f9ea Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 12:40:51 -0400 Subject: ring-buffer: check for failed allocation in ring buffer benchmark The result of the allocation of the ring buffer read page in the ring buffer bench mark does not check the return to see if a page was actually allocated. This patch fixes that. [ Impact: avoid NULL dereference ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 747244acb8f..dcd75e9e49f 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -84,6 +84,9 @@ static enum event_status read_page(int cpu) int i; bpage = ring_buffer_alloc_read_page(buffer); + if (!bpage) + return EVENT_DROPPED; + ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); if (ret >= 0) { rpage = bpage; -- cgit v1.2.3 From 6634ff26cce2da04e5c2a5481bcb8888e7d01786 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 15:30:07 -0400 Subject: ring-buffer: make moving the tail page a separate function Ingo Molnar thought the code would be cleaner if we used a function call instead of a goto for moving the tail page. After implementing this, it seems that gcc still inlines the result and the output is pretty much the same. Since this is considered a cleaner approach, might as well implement it. [ Impact: code clean up ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 89 +++++++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 03ed52b67db..3ae5ccf2c0f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1154,51 +1154,18 @@ static unsigned rb_calculate_event_length(unsigned length) return length; } + static struct ring_buffer_event * -__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length, u64 *ts) +rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long length, unsigned long tail, + struct buffer_page *commit_page, + struct buffer_page *tail_page, u64 *ts) { - struct buffer_page *tail_page, *head_page, *reader_page, *commit_page; - struct buffer_page *next_page; - unsigned long tail, write; + struct buffer_page *next_page, *head_page, *reader_page; struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; - unsigned long flags; bool lock_taken = false; - - commit_page = cpu_buffer->commit_page; - /* we just need to protect against interrupts */ - barrier(); - tail_page = cpu_buffer->tail_page; - write = local_add_return(length, &tail_page->write); - tail = write - length; - - /* See if we shot pass the end of this buffer page */ - if (write > BUF_PAGE_SIZE) - goto next_page; - - /* We reserved something on the buffer */ - - if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) - return NULL; - - event = __rb_page_index(tail_page, tail); - rb_update_event(event, type, length); - - /* The passed in type is zero for DATA */ - if (likely(!type)) - local_inc(&tail_page->entries); - - /* - * If this is a commit and the tail is zero, then update - * this page's time stamp. - */ - if (!tail && rb_is_commit(cpu_buffer, event)) - cpu_buffer->commit_page->page->time_stamp = *ts; - - return event; - - next_page: + unsigned long flags; next_page = tail_page; @@ -1318,6 +1285,48 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, + unsigned type, unsigned long length, u64 *ts) +{ + struct buffer_page *tail_page, *commit_page; + struct ring_buffer_event *event; + unsigned long tail, write; + + commit_page = cpu_buffer->commit_page; + /* we just need to protect against interrupts */ + barrier(); + tail_page = cpu_buffer->tail_page; + write = local_add_return(length, &tail_page->write); + tail = write - length; + + /* See if we shot pass the end of this buffer page */ + if (write > BUF_PAGE_SIZE) + return rb_move_tail(cpu_buffer, length, tail, + commit_page, tail_page, ts); + + /* We reserved something on the buffer */ + + if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) + return NULL; + + event = __rb_page_index(tail_page, tail); + rb_update_event(event, type, length); + + /* The passed in type is zero for DATA */ + if (likely(!type)) + local_inc(&tail_page->entries); + + /* + * If this is a commit and the tail is zero, then update + * this page's time stamp. + */ + if (!tail && rb_is_commit(cpu_buffer, event)) + cpu_buffer->commit_page->page->time_stamp = *ts; + + return event; +} + static int rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, u64 *delta) -- cgit v1.2.3 From 3e07a4f680adc66dfa175aa5021aedf340251b12 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 18:36:59 -0400 Subject: ring-buffer: change test to be more latency friendly The ring buffer benchmark/test runs a producer for 10 seconds. This is done with preemption and interrupts enabled. But if the kernel is not compiled with CONFIG_PREEMPT, it basically stops everything but interrupts for 10 seconds. Although this is just a test and is not for production, this attribute can be quite annoying. It can also spawn badness elsewhere. This patch solves the issues by calling "cond_resched" when the system is not compiled with CONFIG_PREEMPT. It also keeps track of the time spent to call cond_resched such that it does not go against the time calculations. That is, if the task schedules away, the time scheduled out is removed from the test data. Note, this only works for non PREEMPT because we do not know when the task is scheduled out if we have PREEMPT enabled. [ Impact: prevent test from stopping the world for 10 seconds ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index dcd75e9e49f..a26fc67b63b 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -185,6 +185,35 @@ static void ring_buffer_consumer(void) complete(&read_done); } +/* + * If we are a non preempt kernel, the 10 second run will + * stop everything while it runs. Instead, we will call cond_resched + * and also add any time that was lost by a rescedule. + */ +#ifdef CONFIG_PREEMPT +static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv) +{ +} +#else +static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv) +{ + struct timeval tv; + + cond_resched(); + do_gettimeofday(&tv); + if (tv.tv_usec < end_tv->tv_usec) { + tv.tv_usec += 1000000; + tv.tv_sec--; + } + start_tv->tv_sec += tv.tv_sec - end_tv->tv_sec; + start_tv->tv_usec += tv.tv_usec - end_tv->tv_usec; + if (start_tv->tv_usec > 1000000) { + start_tv->tv_usec -= 1000000; + start_tv->tv_sec++; + } +} +#endif + static void ring_buffer_producer(void) { struct timeval start_tv; @@ -221,6 +250,8 @@ static void ring_buffer_producer(void) if (consumer && !(++cnt % wakeup_interval)) wake_up_process(consumer); + sched_if_needed(&start_tv, &end_tv); + } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); pr_info("End ring buffer hammer\n"); -- cgit v1.2.3 From 9456f0fa6d3cb944d3b9fc31c9a244e0362c26ea Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 21:54:09 -0400 Subject: tracing: reset ring buffer when removing modules with events Li Zefan found that there's a race using the event ids of events and modules. When a module is loaded, an event id is incremented. We only have 16 bits for event ids (65536) and there is a possible (but highly unlikely) race that we could load and unload a module that registers events so many times that the event id counter overflows. When it overflows, it then restarts and goes looking for available ids. An id is available if it was added by a module and released. The race is if you have one module add an id, and then is removed. Another module loaded can use that same event id. But if the old module still had events in the ring buffer, the new module's call back would get bogus data. At best (and most likely) the output would just be garbage. But if the module for some reason used pointers (not recommended) then this could potentially crash. The safest thing to do is just reset the ring buffer if a module that registered events is removed. [ Impact: prevent unpredictable results of event id overflows ] Reported-by: Li Zefan LKML-Reference: <49FEAFD0.30106@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 10 ++++++++++ kernel/trace/trace.h | 2 ++ kernel/trace/trace_events.c | 9 +++++++++ 3 files changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4164a344e72..dd40d232034 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -639,6 +639,16 @@ void tracing_reset_online_cpus(struct trace_array *tr) tracing_reset(tr, cpu); } +void tracing_reset_current(int cpu) +{ + tracing_reset(&global_trace, cpu); +} + +void tracing_reset_current_online_cpus(void) +{ + tracing_reset_online_cpus(&global_trace); +} + #define SAVED_CMDLINES 128 #define NO_CMDLINE_MAP UINT_MAX static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 777c6c3a0cd..ba25793ffe6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -409,6 +409,8 @@ int tracing_is_enabled(void); void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); void tracing_reset_online_cpus(struct trace_array *tr); +void tracing_reset_current(int cpu); +void tracing_reset_current_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *trace_create_file(const char *name, mode_t mode, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8d579ff2361..6d2c842a024 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -932,9 +932,11 @@ static void trace_module_remove_events(struct module *mod) { struct ftrace_module_file_ops *file_ops; struct ftrace_event_call *call, *p; + bool found = false; list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { + found = true; if (call->enabled) { call->enabled = 0; call->unregfunc(); @@ -957,6 +959,13 @@ static void trace_module_remove_events(struct module *mod) list_del(&file_ops->list); kfree(file_ops); } + + /* + * It is safest to reset the ring buffer if the module being unloaded + * registered any events. + */ + if (found) + tracing_reset_current_online_cpus(); } static int trace_module_notify(struct notifier_block *self, -- cgit v1.2.3 From 8ae79a138e88aceeeb07077bff2883245fb7c218 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 22:52:15 -0400 Subject: tracing: add hierarchical enabling of events With the current event directory, you can only enable individual events. The file debugfs/tracing/set_event is used to be able to enable or disable several events at once. But that can still be awkward. This patch adds hierarchical enabling of events. That is, each directory in debugfs/tracing/events has an "enable" file. This file can enable or disable all events within the directory and below. # echo 1 > /debugfs/tracing/events/enable will enable all events. # echo 1 > /debugfs/tracing/events/sched/enable will enable all events in the sched subsystem. # echo 1 > /debugfs/tracing/events/enable # echo 0 > /debugfs/tracing/events/irq/enable will enable all events, but then disable just the irq subsystem events. When reading one of these enable files, there are four results: 0 - all events this file affects are disabled 1 - all events this file affects are enabled X - there is a mixture of events enabled and disabled ? - this file does not affect any event Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 140 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6d2c842a024..87feb0117ce 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -400,6 +400,133 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static ssize_t +system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char *system = filp->private_data; + struct ftrace_event_call *call; + char buf[2]; + int set = -1; + int all = 0; + int ret; + + if (system[0] == '*') + all = 1; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + if (!call->name || !call->regfunc) + continue; + + if (!all && strcmp(call->system, system) != 0) + continue; + + /* + * We need to find out if all the events are set + * or if all events or cleared, or if we have + * a mixture. + */ + if (call->enabled) { + switch (set) { + case -1: + set = 1; + break; + case 0: + set = 2; + break; + } + } else { + switch (set) { + case -1: + set = 0; + break; + case 1: + set = 2; + break; + } + } + /* + * If we have a mixture, no need to look further. + */ + if (set == 2) + break; + } + mutex_unlock(&event_mutex); + + buf[1] = '\n'; + switch (set) { + case 0: + buf[0] = '0'; + break; + case 1: + buf[0] = '1'; + break; + case 2: + buf[0] = 'X'; + break; + default: + buf[0] = '?'; + } + + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); + + return ret; +} + +static ssize_t +system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char *system = filp->private_data; + unsigned long val; + char *command; + char buf[64]; + ssize_t ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + switch (val) { + case 0: + case 1: + break; + + default: + return -EINVAL; + } + + command = kstrdup(system, GFP_KERNEL); + if (!command) + return -ENOMEM; + + ret = ftrace_set_clr_event(command, val); + if (ret) + goto out_free; + + ret = cnt; + + out_free: + kfree(command); + + *ppos += cnt; + + return ret; +} + extern char *__bad_type_size(void); #undef FIELD @@ -686,6 +813,12 @@ static const struct file_operations ftrace_subsystem_filter_fops = { .write = subsystem_filter_write, }; +static const struct file_operations ftrace_system_enable_fops = { + .open = tracing_open_generic, + .read = system_enable_read, + .write = system_enable_write, +}; + static const struct file_operations ftrace_show_header_fops = { .open = tracing_open_generic, .read = show_header, @@ -768,6 +901,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events) "'%s/filter' entry\n", name); } + entry = trace_create_file("enable", 0644, system->entry, + (void *)system->name, + &ftrace_system_enable_fops); + return system->entry; } @@ -1041,6 +1178,9 @@ static __init int event_trace_init(void) ring_buffer_print_entry_header, &ftrace_show_header_fops); + trace_create_file("enable", 0644, d_events, + "*:*", &ftrace_system_enable_fops); + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ if (!call->name) -- cgit v1.2.3 From aa47b7e0f89b9998dad4d1667447e8cb7703ff4e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 4 May 2009 01:38:05 -0700 Subject: sched: emit thread info flags with stack trace When a thread is oom killed and fails to exit, it's helpful to know which threads have access to memory reserves if the machine livelocks. This is done by testing for the TIF_MEMDIE thread info flag and should be displayed alongside stack traces to identify tasks that have access to such reserves but are still stuck allocating pages, for instance. It would probably be helpful in other cases as well, so all thread info flags are emitted when showing a task. ( v2: fix warning reported by Stephen Rothwell ) [ Impact: extend debug printout info ] Signed-off-by: David Rientjes Cc: Peter Zijlstra Cc: Stephen Rothwell LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2a43a581ead..5aa63f50c69 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6610,8 +6610,9 @@ void sched_show_task(struct task_struct *p) #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif - printk(KERN_CONT "%5lu %5d %6d\n", free, - task_pid_nr(p), task_pid_nr(p->real_parent)); + printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, + task_pid_nr(p), task_pid_nr(p->real_parent), + (unsigned long)task_thread_info(p)->flags); show_stack(p, NULL); } -- cgit v1.2.3 From e8808c1019b048a43686dbd25c188a035842c2e2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 3 May 2009 02:48:52 +0200 Subject: tracing/filters: support for filters of dynamic sized arrays Currently the filtering infrastructure supports well the numeric types and fixed sized array types. But the recently added __string() field uses a specific indirect offset mechanism which requires a specific predicate. Until now it wasn't supported. This patch adds this support and implies very few changes, only a new predicate is needed, the management of this specific field can be done through the usual string helpers in the filtering infrastructure. [ Impact: support all kinds of strings in the tracing filters ] Cc: Tom Zanussi Cc: Steven Rostedt Cc: Li Zefan Cc: Zhaolei Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events_filter.c | 44 +++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 7ac69108527..01c76eb3e16 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -151,6 +151,7 @@ static int filter_pred_or(struct filter_pred *pred __attribute((unused)), return val1 || val2; } +/* Filter predicate for fixed sized arrays of characters */ static int filter_pred_string(struct filter_pred *pred, void *event, int val1, int val2) { @@ -164,6 +165,30 @@ static int filter_pred_string(struct filter_pred *pred, void *event, return match; } +/* + * Filter predicate for dynamic sized arrays of characters. + * These are implemented through a list of strings at the end + * of the entry. + * Also each of these strings have a field in the entry which + * contains its offset from the beginning of the entry. + * We have then first to get this field, dereference it + * and add it to the address of the entry, and at last we have + * the address of the string. + */ +static int filter_pred_strloc(struct filter_pred *pred, void *event, + int val1, int val2) +{ + int str_loc = *(int *)(event + pred->offset); + char *addr = (char *)(event + str_loc); + int cmp, match; + + cmp = strncmp(addr, pred->str_val, pred->str_len); + + match = (!cmp) ^ pred->not; + + return match; +} + static int filter_pred_none(struct filter_pred *pred, void *event, int val1, int val2) { @@ -446,10 +471,18 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, return 0; } +enum { + FILTER_STATIC_STRING = 1, + FILTER_DYN_STRING +}; + static int is_string_field(const char *type) { if (strchr(type, '[') && strstr(type, "char")) - return 1; + return FILTER_STATIC_STRING; + + if (!strcmp(type, "__str_loc")) + return FILTER_DYN_STRING; return 0; } @@ -512,6 +545,7 @@ static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_field *field; filter_pred_fn_t fn; unsigned long long val; + int string_type; pred->fn = filter_pred_none; @@ -536,8 +570,12 @@ static int filter_add_pred(struct filter_parse_state *ps, return -EINVAL; } - if (is_string_field(field->type)) { - fn = filter_pred_string; + string_type = is_string_field(field->type); + if (string_type) { + if (string_type == FILTER_STATIC_STRING) + fn = filter_pred_string; + else + fn = filter_pred_strloc; pred->str_len = field->size; if (pred->op == OP_NE) pred->not = 1; -- cgit v1.2.3 From 5928c3cc0ffcb6894bbab6be591b7ae1786b2d87 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 3 May 2009 03:03:57 +0200 Subject: tracing/filters: support for operator reserved characters in strings When we set a filter for an event, such as: echo "name == my_lock_name" > \ /debug/tracing/events/lockdep/lock_acquired/filter then the following order of token type is parsed: - space - operator - parentheses - operand Because the operators and parentheses have a higher precedence than the operand characters, which is normal, then we can't use any string containing such special characters: ()=<>!&| To get this support and also avoid ambiguous intepretation from the parser or the human, we can do it using double quotes so that we keep the usual languages habits. Then after this patch you can still declare string condition like before: echo name == myname But if you want to compare against a string containing an operator character, you can use double quotes: echo 'name == "&myname"' Don't forget to include the whole expression into single quotes or the double ones will be eaten by echo. [ Impact: support strings with special characters for tracing filters ] Cc: Tom Zanussi Cc: Steven Rostedt Cc: Li Zefan Cc: Zhaolei Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events_filter.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 01c76eb3e16..8c62e5bdff0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -851,10 +851,19 @@ static void postfix_clear(struct filter_parse_state *ps) static int filter_parse(struct filter_parse_state *ps) { + int in_string = 0; int op, top_op; char ch; while ((ch = infix_next(ps))) { + if (ch == '"') { + in_string ^= 1; + continue; + } + + if (in_string) + goto parse_operand; + if (isspace(ch)) continue; @@ -908,6 +917,7 @@ static int filter_parse(struct filter_parse_state *ps) } continue; } +parse_operand: if (append_operand_char(ps, ch)) { parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0); return -EINVAL; -- cgit v1.2.3 From d94fc523f3c35bd8013f04827e94756cbc0212f4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 7 May 2009 15:11:15 +0800 Subject: tracing/events: fix concurrent access to ftrace_events list, fix In filter_add_subsystem_pred() we should release event_mutex before calling filter_free_subsystem_preds(), since both functions hold event_mutex. [ Impact: fix deadlock when writing invalid pred into subsystem filter ] Signed-off-by: Li Zefan Cc: tzanussi@gmail.com Cc: a.p.zijlstra@chello.nl Cc: fweisbec@gmail.com Cc: rostedt@goodmis.org LKML-Reference: <4A028993.7020509@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8c62e5bdff0..85ad6a8939a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -636,14 +636,15 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, err = filter_add_pred(ps, call, pred); if (err) { + mutex_unlock(&event_mutex); filter_free_subsystem_preds(system); parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - break; + goto out; } replace_filter_string(call->filter, filter_string); } mutex_unlock(&event_mutex); - +out: return err; } -- cgit v1.2.3 From 29c8000ee7da3a6756d26143991e573eaaf2a9f6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 11:13:42 -0400 Subject: ring-buffer: remove complex calculations in ring-buffer-test Ingo Molnar thought that the code to calculate the time in cond_resched is a bit too ugly and is not needed. This patch removes it and replaces it with a simple call to cond_resched. I kept the comment that explains the reason for the cond_resched. [ Impact: remove ugly code ] Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 37 +++++++----------------------------- 1 file changed, 7 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index a26fc67b63b..f4ceb453c7d 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -185,35 +185,6 @@ static void ring_buffer_consumer(void) complete(&read_done); } -/* - * If we are a non preempt kernel, the 10 second run will - * stop everything while it runs. Instead, we will call cond_resched - * and also add any time that was lost by a rescedule. - */ -#ifdef CONFIG_PREEMPT -static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv) -{ -} -#else -static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv) -{ - struct timeval tv; - - cond_resched(); - do_gettimeofday(&tv); - if (tv.tv_usec < end_tv->tv_usec) { - tv.tv_usec += 1000000; - tv.tv_sec--; - } - start_tv->tv_sec += tv.tv_sec - end_tv->tv_sec; - start_tv->tv_usec += tv.tv_usec - end_tv->tv_usec; - if (start_tv->tv_usec > 1000000) { - start_tv->tv_usec -= 1000000; - start_tv->tv_sec++; - } -} -#endif - static void ring_buffer_producer(void) { struct timeval start_tv; @@ -250,7 +221,13 @@ static void ring_buffer_producer(void) if (consumer && !(++cnt % wakeup_interval)) wake_up_process(consumer); - sched_if_needed(&start_tv, &end_tv); + /* + * If we are a non preempt kernel, the 10 second run will + * stop everything while it runs. Instead, we will call + * cond_resched and also add any time that was lost by a + * rescedule. + */ + cond_resched(); } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); pr_info("End ring buffer hammer\n"); -- cgit v1.2.3 From d6bf81ef0f7474434c2a049e8bf3c9146a14dd96 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 11:49:35 -0400 Subject: tracing: append ":*" to internal setting of system events The system enabling of events uses the same code as the set_event file. It passes in the name of the system to the parser and that will enable all the events that has that system as a name. The problem is that it will also enable events with the same name as the system. If you have system name foo, and system name bar, but within the system bar, there exists an event called foo. By setting the system name foo, you will also be enabling the event foo in the system bar. This is not an expected result. The solution is to pass in "foo:*", which will only enable the system foo and not events called foo. [ Impact: prevent accidental enabling of events with same name as a system ] Reported-by: Li Zefan Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 87feb0117ce..8d0fae3af59 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -509,9 +509,11 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return -EINVAL; } - command = kstrdup(system, GFP_KERNEL); + /* +3 for the ":*\0" */ + command = kmalloc(strlen(system)+3, GFP_KERNEL); if (!command) return -ENOMEM; + sprintf(command, "%s:*", system); ret = ftrace_set_clr_event(command, val); if (ret) @@ -1179,7 +1181,7 @@ static __init int event_trace_init(void) &ftrace_show_header_fops); trace_create_file("enable", 0644, d_events, - "*:*", &ftrace_system_enable_fops); + "*", &ftrace_system_enable_fops); for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ -- cgit v1.2.3 From 65b77242043f74bca6a0d733c0e48ef03a8c9893 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 12:49:27 -0400 Subject: tracing: have menu default enabled when kernel debug is configured Tracing can be very helpful to debug the kernel. When DEBUG_KERNEL is enabled it is nice to enable the trace menu as well. This patch only make the tracing menu enabled by default, it does not make any of the tracers enabled. And the menu is only enabled by default if DEBUG_KERNEL is enabled. [ Impact: show tracing options to those debugging the kernel ] Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 50f62a296e1..f61be301578 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -79,6 +79,7 @@ if TRACING_SUPPORT menuconfig FTRACE bool "Tracers" + default y if DEBUG_KERNEL help Enable the kernel tracing infrastructure. -- cgit v1.2.3 From 0574ea421b90e0e45a72c447dd3c2c79ffd8c153 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 14:20:28 -0400 Subject: ring-buffer: only periodically call cond_resched to ring-buffer-benchmark Calling cond_resched at every iteration of the loop adds a bit of overhead to the benchmark. This patch does two things. 1) only calls cond-resched when CONFIG_PREEMPT is not enabled 2) only calls cond-resched after so many traces has been performed. [ Impact: less overhead to the ring-buffer-benchmark ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index f4ceb453c7d..a7c048bb446 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -218,16 +218,23 @@ static void ring_buffer_producer(void) } do_gettimeofday(&end_tv); - if (consumer && !(++cnt % wakeup_interval)) + cnt++; + if (consumer && !(cnt % wakeup_interval)) wake_up_process(consumer); +#ifndef CONFIG_PREEMPT /* * If we are a non preempt kernel, the 10 second run will * stop everything while it runs. Instead, we will call * cond_resched and also add any time that was lost by a * rescedule. + * + * Do a cond resched at the same frequency we would wake up + * the reader. */ - cond_resched(); + if (cnt % wakeup_interval) + cond_resched(); +#endif } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); pr_info("End ring buffer hammer\n"); -- cgit v1.2.3 From 7da3046d6ce6ea97494020081c509b642b7016af Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 19:52:20 -0400 Subject: ring-buffer: add total count in ring-buffer-benchmark It is nice to see the overhead of the benchmark test when tracing is disabled. That is, we turn off the ring buffer just to see what the cost of running the loop that calls into the ring buffer is. Currently, if no entries wer made, we get 0. This is not informative. This patch changes it to check if we had any "missed" (non recorded) events. If so, a total count is also reported. [ Impact: evaluate the over head of the ring buffer benchmark test ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index a7c048bb446..a21aa7b3d05 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -285,6 +285,17 @@ static void ring_buffer_producer(void) avg = 1000000 / hit; pr_info("%ld ns per entry\n", avg); } + + + if (missed) { + if (time) + missed /= (long)time; + + pr_info("Total iterations per millisec: %ld\n", hit + missed); + + avg = 1000000 / (hit + missed); + pr_info("%ld ns per entry\n", avg); + } } static void wait_to_die(void) -- cgit v1.2.3 From 74f4fd21664148b8c454cc07bfe74e4dd51cf07b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 19:58:55 -0400 Subject: ring-buffer: change WARN_ON from checking preempt_count to preemptible There's a WARN_ON in the ring buffer code that makes sure preemption is disabled. It checks "!preempt_count()". But when CONFIG_PREEMPT is not enabled, preempt_count() is always zero, and this will trigger the warning. [ Impact: prevent false warning on non preemptible kernels ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3ae5ccf2c0f..361170609bd 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1688,7 +1688,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, * committed yet. Thus we can assume that preemption * is still disabled. */ - RB_WARN_ON(buffer, !preempt_count()); + RB_WARN_ON(buffer, preemptible()); cpu = smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; -- cgit v1.2.3 From 8f31bfe538ebafac187d2d4465a92e1d9ee6d8c2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 8 May 2009 10:31:42 +0800 Subject: tracing/events: clean up for ftrace_set_clr_event() Add a helper function __ftrace_set_clr_event(), and replace some ftrace_set_clr_event() calls with this helper, thus we don't need any kstrdup() or kmalloc(). As a side effect, this patch fixes an issue in self tests code, which is similar to the one fixed in commit d6bf81ef0f7474434c2a049e8bf3c9146a14dd96 ("tracing: append ":*" to internal setting of system events") It's a small issue and won't cause any bug in fact, but we should do things right anyway. [ Impact: prevent spurious event-enabling in tracing self-tests ] Signed-off-by: Li Zefan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <4A03998E.3020503@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 126 ++++++++++++++++---------------------------- 1 file changed, 46 insertions(+), 80 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8d0fae3af59..45f1099386b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -111,11 +111,44 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, } } -static int ftrace_set_clr_event(char *buf, int set) +/* + * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. + */ +static int __ftrace_set_clr_event(const char *match, const char *sub, + const char *event, int set) { struct ftrace_event_call *call; + int ret; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + + if (!call->name || !call->regfunc) + continue; + + if (match && + strcmp(match, call->name) != 0 && + strcmp(match, call->system) != 0) + continue; + + if (sub && strcmp(sub, call->system) != 0) + continue; + + if (event && strcmp(event, call->name) != 0) + continue; + + ftrace_event_enable_disable(call, set); + + ret = 0; + } + mutex_unlock(&event_mutex); + + return ret; +} + +static int ftrace_set_clr_event(char *buf, int set) +{ char *event = NULL, *sub = NULL, *match; - int ret = -EINVAL; /* * The buf format can be : @@ -141,30 +174,7 @@ static int ftrace_set_clr_event(char *buf, int set) event = NULL; } - mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - - if (!call->name || !call->regfunc) - continue; - - if (match && - strcmp(match, call->name) != 0 && - strcmp(match, call->system) != 0) - continue; - - if (sub && strcmp(sub, call->system) != 0) - continue; - - if (event && strcmp(event, call->name) != 0) - continue; - - ftrace_event_enable_disable(call, set); - - ret = 0; - } - mutex_unlock(&event_mutex); - - return ret; + return __ftrace_set_clr_event(match, sub, event, set); } /* 128 should be much more than enough */ @@ -408,18 +418,14 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, struct ftrace_event_call *call; char buf[2]; int set = -1; - int all = 0; int ret; - if (system[0] == '*') - all = 1; - mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { if (!call->name || !call->regfunc) continue; - if (!all && strcmp(call->system, system) != 0) + if (system && strcmp(call->system, system) != 0) continue; /* @@ -480,7 +486,6 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, { const char *system = filp->private_data; unsigned long val; - char *command; char buf[64]; ssize_t ret; @@ -500,30 +505,16 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret < 0) return ret; - switch (val) { - case 0: - case 1: - break; - - default: + if (val != 0 && val != 1) return -EINVAL; - } - /* +3 for the ":*\0" */ - command = kmalloc(strlen(system)+3, GFP_KERNEL); - if (!command) - return -ENOMEM; - sprintf(command, "%s:*", system); - - ret = ftrace_set_clr_event(command, val); + ret = __ftrace_set_clr_event(NULL, system, NULL, val); if (ret) - goto out_free; + goto out; ret = cnt; - out_free: - kfree(command); - +out: *ppos += cnt; return ret; @@ -1181,7 +1172,7 @@ static __init int event_trace_init(void) &ftrace_show_header_fops); trace_create_file("enable", 0644, d_events, - "*", &ftrace_system_enable_fops); + NULL, &ftrace_system_enable_fops); for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ @@ -1259,7 +1250,6 @@ static __init void event_trace_self_tests(void) { struct ftrace_event_call *call; struct event_subsystem *system; - char *sysname; int ret; pr_info("Running tests on trace events:\n"); @@ -1305,14 +1295,7 @@ static __init void event_trace_self_tests(void) pr_info("Testing event system %s: ", system->name); - /* ftrace_set_clr_event can modify the name passed in. */ - sysname = kstrdup(system->name, GFP_KERNEL); - if (WARN_ON(!sysname)) { - pr_warning("Can't allocate memory, giving up!\n"); - return; - } - ret = ftrace_set_clr_event(sysname, 1); - kfree(sysname); + ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); if (WARN_ON_ONCE(ret)) { pr_warning("error enabling system %s\n", system->name); @@ -1321,14 +1304,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); - sysname = kstrdup(system->name, GFP_KERNEL); - if (WARN_ON(!sysname)) { - pr_warning("Can't allocate memory, giving up!\n"); - return; - } - ret = ftrace_set_clr_event(sysname, 0); - kfree(sysname); - + ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); if (WARN_ON_ONCE(ret)) pr_warning("error disabling system %s\n", system->name); @@ -1341,15 +1317,8 @@ static __init void event_trace_self_tests(void) pr_info("Running tests on all trace events:\n"); pr_info("Testing all events: "); - sysname = kmalloc(4, GFP_KERNEL); - if (WARN_ON(!sysname)) { - pr_warning("Can't allocate memory, giving up!\n"); - return; - } - memcpy(sysname, "*:*", 4); - ret = ftrace_set_clr_event(sysname, 1); + ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); if (WARN_ON_ONCE(ret)) { - kfree(sysname); pr_warning("error enabling all events\n"); return; } @@ -1357,10 +1326,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); /* reset sysname */ - memcpy(sysname, "*:*", 4); - ret = ftrace_set_clr_event(sysname, 0); - kfree(sysname); - + ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); if (WARN_ON_ONCE(ret)) { pr_warning("error disabling all events\n"); return; -- cgit v1.2.3 From c142b15dc56ee6d55cb97a062e3c8e9c61e384c0 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 8 May 2009 10:32:05 +0800 Subject: tracing/events: simplify system_enable_read() A smarter way to figure out the output of an enable file. [ Impact: clean up ] Signed-off-by: Li Zefan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <4A0399A5.2080603@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 40 ++++++---------------------------------- 1 file changed, 6 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 45f1099386b..df394bc6d54 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -414,10 +414,11 @@ static ssize_t system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { + const char set_to_char[4] = { '?', '0', '1', 'X' }; const char *system = filp->private_data; struct ftrace_event_call *call; char buf[2]; - int set = -1; + int set = 0; int ret; mutex_lock(&event_mutex); @@ -433,47 +434,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, * or if all events or cleared, or if we have * a mixture. */ - if (call->enabled) { - switch (set) { - case -1: - set = 1; - break; - case 0: - set = 2; - break; - } - } else { - switch (set) { - case -1: - set = 0; - break; - case 1: - set = 2; - break; - } - } + set |= (1 << !!call->enabled); + /* * If we have a mixture, no need to look further. */ - if (set == 2) + if (set == 3) break; } mutex_unlock(&event_mutex); + buf[0] = set_to_char[set]; buf[1] = '\n'; - switch (set) { - case 0: - buf[0] = '0'; - break; - case 1: - buf[0] = '1'; - break; - case 2: - buf[0] = 'X'; - break; - default: - buf[0] = '?'; - } ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); -- cgit v1.2.3 From 7fc23a5380797012e92a9633169440f2f4a21253 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 May 2009 18:52:21 +0200 Subject: perf_counter: optimize perf_counter_task_tick() perf_counter_task_tick() does way too much work to find out there's nothing to do. Provide an easy short-circuit for the normal case where there are no counters on the system. [ Impact: micro-optimization ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090508170028.750619201@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 60e55f0b48f..fdb0d242127 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -39,6 +39,7 @@ int perf_max_counters __read_mostly = 1; static int perf_reserved_percpu __read_mostly; static int perf_overcommit __read_mostly = 1; +static atomic_t nr_counters __read_mostly; static atomic_t nr_mmap_tracking __read_mostly; static atomic_t nr_munmap_tracking __read_mostly; static atomic_t nr_comm_tracking __read_mostly; @@ -1076,8 +1077,14 @@ static void rotate_ctx(struct perf_counter_context *ctx) void perf_counter_task_tick(struct task_struct *curr, int cpu) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_cpu_context *cpuctx; + struct perf_counter_context *ctx; + + if (!atomic_read(&nr_counters)) + return; + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = &curr->perf_counter_ctx; perf_counter_cpu_sched_out(cpuctx); perf_counter_task_sched_out(curr, cpu); @@ -1197,6 +1204,7 @@ static void free_counter(struct perf_counter *counter) { perf_pending_sync(counter); + atomic_dec(&nr_counters); if (counter->hw_event.mmap) atomic_dec(&nr_mmap_tracking); if (counter->hw_event.munmap) @@ -2861,6 +2869,7 @@ done: counter->pmu = pmu; + atomic_inc(&nr_counters); if (counter->hw_event.mmap) atomic_inc(&nr_mmap_tracking); if (counter->hw_event.munmap) -- cgit v1.2.3 From 3df5edad87a998273aa5a9a8c728c05d855ad00e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 May 2009 18:52:22 +0200 Subject: perf_counter: rework ioctl()s Corey noticed that ioctl()s on grouped counters didn't work on the whole group. This extends the ioctl() interface to take a second argument that is interpreted as a flags field. We then provide PERF_IOC_FLAG_GROUP to toggle the behaviour. Having this flag gives the greatest flexibility, allowing you to individually enable/disable/reset counters in a group, or all together. [ Impact: fix group counter enable/disable semantics ] Reported-by: Corey Ashford Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <20090508170028.837558214@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 104 ++++++++++++++++++++++++++++---------------------- 1 file changed, 58 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index fdb0d242127..f4883f1f47e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -82,7 +82,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) * add it straight to the context's counter list, or to the group * leader's sibling list: */ - if (counter->group_leader == counter) + if (group_leader == counter) list_add_tail(&counter->list_entry, &ctx->counter_list); else { list_add_tail(&counter->list_entry, &group_leader->sibling_list); @@ -385,24 +385,6 @@ static void perf_counter_disable(struct perf_counter *counter) spin_unlock_irq(&ctx->lock); } -/* - * Disable a counter and all its children. - */ -static void perf_counter_disable_family(struct perf_counter *counter) -{ - struct perf_counter *child; - - perf_counter_disable(counter); - - /* - * Lock the mutex to protect the list of children - */ - mutex_lock(&counter->mutex); - list_for_each_entry(child, &counter->child_list, child_list) - perf_counter_disable(child); - mutex_unlock(&counter->mutex); -} - static int counter_sched_in(struct perf_counter *counter, struct perf_cpu_context *cpuctx, @@ -753,24 +735,6 @@ static int perf_counter_refresh(struct perf_counter *counter, int refresh) return 0; } -/* - * Enable a counter and all its children. - */ -static void perf_counter_enable_family(struct perf_counter *counter) -{ - struct perf_counter *child; - - perf_counter_enable(counter); - - /* - * Lock the mutex to protect the list of children - */ - mutex_lock(&counter->mutex); - list_for_each_entry(child, &counter->child_list, child_list) - perf_counter_enable(child); - mutex_unlock(&counter->mutex); -} - void __perf_counter_sched_out(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx) { @@ -1307,31 +1271,79 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) static void perf_counter_reset(struct perf_counter *counter) { + (void)perf_counter_read(counter); atomic_set(&counter->count, 0); + perf_counter_update_userpage(counter); +} + +static void perf_counter_for_each_sibling(struct perf_counter *counter, + void (*func)(struct perf_counter *)) +{ + struct perf_counter_context *ctx = counter->ctx; + struct perf_counter *sibling; + + spin_lock_irq(&ctx->lock); + counter = counter->group_leader; + + func(counter); + list_for_each_entry(sibling, &counter->sibling_list, list_entry) + func(sibling); + spin_unlock_irq(&ctx->lock); +} + +static void perf_counter_for_each_child(struct perf_counter *counter, + void (*func)(struct perf_counter *)) +{ + struct perf_counter *child; + + mutex_lock(&counter->mutex); + func(counter); + list_for_each_entry(child, &counter->child_list, child_list) + func(child); + mutex_unlock(&counter->mutex); +} + +static void perf_counter_for_each(struct perf_counter *counter, + void (*func)(struct perf_counter *)) +{ + struct perf_counter *child; + + mutex_lock(&counter->mutex); + perf_counter_for_each_sibling(counter, func); + list_for_each_entry(child, &counter->child_list, child_list) + perf_counter_for_each_sibling(child, func); + mutex_unlock(&counter->mutex); } static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct perf_counter *counter = file->private_data; - int err = 0; + void (*func)(struct perf_counter *); + u32 flags = arg; switch (cmd) { case PERF_COUNTER_IOC_ENABLE: - perf_counter_enable_family(counter); + func = perf_counter_enable; break; case PERF_COUNTER_IOC_DISABLE: - perf_counter_disable_family(counter); - break; - case PERF_COUNTER_IOC_REFRESH: - err = perf_counter_refresh(counter, arg); + func = perf_counter_disable; break; case PERF_COUNTER_IOC_RESET: - perf_counter_reset(counter); + func = perf_counter_reset; break; + + case PERF_COUNTER_IOC_REFRESH: + return perf_counter_refresh(counter, arg); default: - err = -ENOTTY; + return -ENOTTY; } - return err; + + if (flags & PERF_IOC_FLAG_GROUP) + perf_counter_for_each(counter, func); + else + perf_counter_for_each_child(counter, func); + + return 0; } /* -- cgit v1.2.3 From a85f61abe11a46553c4562e74edb27ebc782aeb7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 May 2009 18:52:23 +0200 Subject: perf_counter: add PERF_RECORD_CONFIG Much like CONFIG_RECORD_GROUP records the hw_event.config to identify the values, allow to record this for all counters. [ Impact: extend perfcounter output record format ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090508170028.923228280@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f4883f1f47e..c615f52aa40 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1994,6 +1994,11 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } + if (record_type & PERF_RECORD_CONFIG) { + header.type |= PERF_RECORD_CONFIG; + header.size += sizeof(u64); + } + if (record_type & PERF_RECORD_GROUP) { header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + @@ -2029,6 +2034,9 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_ADDR) perf_output_put(&handle, addr); + if (record_type & PERF_RECORD_CONFIG) + perf_output_put(&handle, counter->hw_event.config); + /* * XXX PERF_RECORD_GROUP vs inherited counters seems difficult. */ -- cgit v1.2.3 From f370e1e2f195ec1e6420e26fc83e0319595db578 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 May 2009 18:52:24 +0200 Subject: perf_counter: add PERF_RECORD_CPU Allow recording the CPU number the event was generated on. RFC: this leaves a u32 as reserved, should we fill in the node_id() there, or leave this open for future extention, as userspace can already easily do the cpu->node mapping if needed. [ Impact: extend perfcounter output record format ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090508170029.008627711@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c615f52aa40..d850a1fb8d4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1956,6 +1956,9 @@ static void perf_counter_output(struct perf_counter *counter, struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; u64 time; + struct { + u32 cpu, reserved; + } cpu_entry; header.type = 0; header.size = sizeof(header); @@ -1999,6 +2002,13 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } + if (record_type & PERF_RECORD_CPU) { + header.type |= PERF_RECORD_CPU; + header.size += sizeof(cpu_entry); + + cpu_entry.cpu = raw_smp_processor_id(); + } + if (record_type & PERF_RECORD_GROUP) { header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + @@ -2037,6 +2047,9 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_CONFIG) perf_output_put(&handle, counter->hw_event.config); + if (record_type & PERF_RECORD_CPU) + perf_output_put(&handle, cpu_entry); + /* * XXX PERF_RECORD_GROUP vs inherited counters seems difficult. */ -- cgit v1.2.3 From 29f93943d1916d1a3faa3f10f4a06994347ac990 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 May 2009 16:06:47 -0400 Subject: tracing: initialize return value for __ftrace_set_clr_event Commit 8f31bfe538ebafac187d2d4465a92e1d9ee6d8c2 tracing/events: clean up for ftrace_set_clr_event() Moved out the code for ftrace_set_clr_event into a helper funciton but did not initialize the return value. As a result, we do not warn about a typo in the echoing of events in set_event. This patch restores the old warning: # echo foobar > set_event -bash: echo: write error: Invalid argument [ Impact: restore warning of invalid entries to set_event ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index df394bc6d54..2eecb87e42d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -118,7 +118,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, const char *event, int set) { struct ftrace_event_call *call; - int ret; + int ret = -EINVAL; mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { -- cgit v1.2.3 From 4671c79408a3f8a5a6a45e39c4c164dada3a5678 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 May 2009 16:27:41 -0400 Subject: tracing: add trace_set_clr_event to export event enabling function Other parts of the kernel may need to be able to enable or disable specific events. Especially parts that create trace events. [ Impact: allow enabling of trace events by those that create the event ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2eecb87e42d..0eec0c55dd8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -177,6 +177,23 @@ static int ftrace_set_clr_event(char *buf, int set) return __ftrace_set_clr_event(match, sub, event, set); } +/** + * trace_set_clr_event - enable or disable an event + * @system: system name to match (NULL for any system) + * @event: event name to match (NULL for all events, within system) + * @set: 1 to enable, 0 to disable + * + * This is a way for other parts of the kernel to enable or disable + * event recording. + * + * Returns 0 on success, -EINVAL if the parameters do not match any + * registered events. + */ +int trace_set_clr_event(const char *system, const char *event, int set) +{ + return __ftrace_set_clr_event(NULL, system, event, set); +} + /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 -- cgit v1.2.3 From 92d23f703c608fcb2c8edd74a3fd0f4031e18606 Mon Sep 17 00:00:00 2001 From: Ron Date: Fri, 8 May 2009 22:54:49 +0930 Subject: sched: Fix fallback sched_clock()'s offset when using jiffies Account for the initial offset to the jiffy count. [ Impact: fix printk timestamps on architectures using fallback sched_clock() ] Signed-off-by: Ron Lee Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 819f17ac796..e1d16c9a768 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -38,7 +38,8 @@ */ unsigned long long __attribute__((weak)) sched_clock(void) { - return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); + return (unsigned long long)(jiffies - INITIAL_JIFFIES) + * (NSEC_PER_SEC / HZ); } static __read_mostly int sched_clock_running; -- cgit v1.2.3 From 5e751e992f3fb08ba35e1ca8095ec8fbf9eda523 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 8 May 2009 13:55:22 +0100 Subject: CRED: Rename cred_exec_mutex to reflect that it's a guard against ptrace Rename cred_exec_mutex to reflect that it's a guard against foreign intervention on a process's credential state, such as is made by ptrace(). The attachment of a debugger to a process affects execve()'s calculation of the new credential state - _and_ also setprocattr()'s calculation of that state. Signed-off-by: David Howells Signed-off-by: James Morris --- kernel/cred.c | 4 ++-- kernel/ptrace.c | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 3a039189d70..1bb4d7e5d61 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -167,7 +167,7 @@ EXPORT_SYMBOL(prepare_creds); /* * Prepare credentials for current to perform an execve() - * - The caller must hold current->cred_exec_mutex + * - The caller must hold current->cred_guard_mutex */ struct cred *prepare_exec_creds(void) { @@ -276,7 +276,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct cred *new; int ret; - mutex_init(&p->cred_exec_mutex); + mutex_init(&p->cred_guard_mutex); if ( #ifdef CONFIG_KEYS diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0692ab5a0d6..27ac80298bf 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -185,10 +185,11 @@ int ptrace_attach(struct task_struct *task) if (same_thread_group(task, current)) goto out; - /* Protect exec's credential calculations against our interference; - * SUID, SGID and LSM creds get determined differently under ptrace. + /* Protect the target's credential calculations against our + * interference; SUID, SGID and LSM creds get determined differently + * under ptrace. */ - retval = mutex_lock_interruptible(&task->cred_exec_mutex); + retval = mutex_lock_interruptible(&task->cred_guard_mutex); if (retval < 0) goto out; @@ -232,7 +233,7 @@ repeat: bad: write_unlock_irqrestore(&tasklist_lock, flags); task_unlock(task); - mutex_unlock(&task->cred_exec_mutex); + mutex_unlock(&task->cred_guard_mutex); out: return retval; } -- cgit v1.2.3 From 5b93629b4509c03ffa87a9316412fedf6f58cb37 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2009 22:24:38 +0900 Subject: block: implement blk_rq_pos/[cur_]sectors() and convert obvious ones Implement accessors - blk_rq_pos(), blk_rq_sectors() and blk_rq_cur_sectors() which return rq->hard_sector, rq->hard_nr_sectors and rq->hard_cur_sectors respectively and convert direct references of the said fields to the accessors. This is in preparation of request data length handling cleanup. Geert : suggested adding const to struct request * parameter to accessors Sergei : spotted error in patch description [ Impact: cleanup ] Signed-off-by: Tejun Heo Acked-by: Geert Uytterhoeven Acked-by: Stephen Rothwell Tested-by: Grant Likely Acked-by: Grant Likely Ackec-by: Sergei Shtylyov Cc: Bartlomiej Zolnierkiewicz Cc: Borislav Petkov Cc: James Bottomley Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 921ef5d1f0b..42f1c11e754 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -646,7 +646,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9, rw, what, rq->errors, 0, NULL); } } @@ -857,7 +857,7 @@ void blk_add_driver_data(struct request_queue *q, __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, rq->errors, len, data); else - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9, 0, BLK_TA_DRV_DATA, rq->errors, len, data); } EXPORT_SYMBOL_GPL(blk_add_driver_data); -- cgit v1.2.3 From 2e46e8b27aa57c6bd34b3102b40ee4d0144b4fab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2009 22:24:41 +0900 Subject: block: drop request->hard_* and *nr_sectors struct request has had a few different ways to represent some properties of a request. ->hard_* represent block layer's view of the request progress (completion cursor) and the ones without the prefix are supposed to represent the issue cursor and allowed to be updated as necessary by the low level drivers. The thing is that as block layer supports partial completion, the two cursors really aren't necessary and only cause confusion. In addition, manual management of request detail from low level drivers is cumbersome and error-prone at the very least. Another interesting duplicate fields are rq->[hard_]nr_sectors and rq->{hard_cur|current}_nr_sectors against rq->data_len and rq->bio->bi_size. This is more convoluted than the hard_ case. rq->[hard_]nr_sectors are initialized for requests with bio but blk_rq_bytes() uses it only for !pc requests. rq->data_len is initialized for all request but blk_rq_bytes() uses it only for pc requests. This causes good amount of confusion throughout block layer and its drivers and determining the request length has been a bit of black magic which may or may not work depending on circumstances and what the specific LLD is actually doing. rq->{hard_cur|current}_nr_sectors represent the number of sectors in the contiguous data area at the front. This is mainly used by drivers which transfers data by walking request segment-by-segment. This value always equals rq->bio->bi_size >> 9. However, data length for pc requests may not be multiple of 512 bytes and using this field becomes a bit confusing. In general, having multiple fields to represent the same property leads only to confusion and subtle bugs. With recent block low level driver cleanups, no driver is accessing or manipulating these duplicate fields directly. Drop all the duplicates. Now rq->sector means the current sector, rq->data_len the current total length and rq->bio->bi_size the current segment length. Everything else is defined in terms of these three and available only through accessors. * blk_recalc_rq_sectors() is collapsed into blk_update_request() and now handles pc and fs requests equally other than rq->sector update. This means that now pc requests can use partial completion too (no in-kernel user yet tho). * bio_cur_sectors() is replaced with bio_cur_bytes() as block layer now uses byte count as the primary data length. * blk_rq_pos() is now guranteed to be always correct. In-block users converted. * blk_rq_bytes() is now guaranteed to be always valid as is blk_rq_sectors(). In-block users converted. * blk_rq_sectors() is now guaranteed to equal blk_rq_bytes() >> 9. More convenient one is used. * blk_rq_bytes() and blk_rq_cur_bytes() are now inlined and take const pointer to request. [ Impact: API cleanup, single way to represent one property of a request ] Signed-off-by: Tejun Heo Cc: Boaz Harrosh Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 42f1c11e754..5708a14bee5 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -642,12 +642,12 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (blk_pc_request(rq)) { what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, - rq->cmd_len, rq->cmd); + __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, + what, rq->errors, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9, - rw, what, rq->errors, 0, NULL); + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, + what, rq->errors, 0, NULL); } } @@ -854,11 +854,11 @@ void blk_add_driver_data(struct request_queue *q, return; if (blk_pc_request(rq)) - __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, - rq->errors, len, data); + __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, + BLK_TA_DRV_DATA, rq->errors, len, data); else - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9, - 0, BLK_TA_DRV_DATA, rq->errors, len, data); + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, + BLK_TA_DRV_DATA, rq->errors, len, data); } EXPORT_SYMBOL_GPL(blk_add_driver_data); -- cgit v1.2.3 From 6751b71ea2c7ab8c0d65f01973a3fc8ea16992f4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 11 May 2009 12:08:02 +1000 Subject: perf_counter: Put whole group on when enabling group leader Currently, if you have a group where the leader is disabled and there are siblings that are enabled, and then you enable the leader, we only put the leader on the PMU, and not its enabled siblings. This is incorrect, since the enabled group members should be all on or all off at any given point. This fixes it by adding a call to group_sched_in in __perf_counter_enable in the case where we're enabling a group leader. To avoid the need for a forward declaration this also moves group_sched_in up before __perf_counter_enable. The actual content of group_sched_in is unchanged by this patch. [ Impact: fix bug in counter enable code ] Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Corey Ashford LKML-Reference: <18951.34946.451546.691693@drongo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 99 ++++++++++++++++++++++++++------------------------- 1 file changed, 51 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d850a1fb8d4..a5bdc93ac47 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -419,6 +419,54 @@ counter_sched_in(struct perf_counter *counter, return 0; } +static int +group_sched_in(struct perf_counter *group_counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, + int cpu) +{ + struct perf_counter *counter, *partial_group; + int ret; + + if (group_counter->state == PERF_COUNTER_STATE_OFF) + return 0; + + ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); + if (ret) + return ret < 0 ? ret : 0; + + group_counter->prev_state = group_counter->state; + if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) + return -EAGAIN; + + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { + counter->prev_state = counter->state; + if (counter_sched_in(counter, cpuctx, ctx, cpu)) { + partial_group = counter; + goto group_error; + } + } + + return 0; + +group_error: + /* + * Groups can be scheduled in as one unit only, so undo any + * partial group before returning: + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { + if (counter == partial_group) + break; + counter_sched_out(counter, cpuctx, ctx); + } + counter_sched_out(group_counter, cpuctx, ctx); + + return -EAGAIN; +} + /* * Return 1 for a group consisting entirely of software counters, * 0 if the group contains any hardware counters. @@ -643,6 +691,9 @@ static void __perf_counter_enable(void *info) if (!group_can_go_on(counter, cpuctx, 1)) err = -EEXIST; + else if (counter == leader) + err = group_sched_in(counter, cpuctx, ctx, + smp_processor_id()); else err = counter_sched_in(counter, cpuctx, ctx, smp_processor_id()); @@ -791,54 +842,6 @@ static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) __perf_counter_sched_out(&cpuctx->ctx, cpuctx); } -static int -group_sched_in(struct perf_counter *group_counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, - int cpu) -{ - struct perf_counter *counter, *partial_group; - int ret; - - if (group_counter->state == PERF_COUNTER_STATE_OFF) - return 0; - - ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); - if (ret) - return ret < 0 ? ret : 0; - - group_counter->prev_state = group_counter->state; - if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) - return -EAGAIN; - - /* - * Schedule in siblings as one group (if any): - */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - counter->prev_state = counter->state; - if (counter_sched_in(counter, cpuctx, ctx, cpu)) { - partial_group = counter; - goto group_error; - } - } - - return 0; - -group_error: - /* - * Groups can be scheduled in as one unit only, so undo any - * partial group before returning: - */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - if (counter == partial_group) - break; - counter_sched_out(counter, cpuctx, ctx); - } - counter_sched_out(group_counter, cpuctx, ctx); - - return -EAGAIN; -} - static void __perf_counter_sched_in(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx, int cpu) -- cgit v1.2.3 From a08b159fc243dbfe415250466d24cfc5010deee5 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 11 May 2009 15:46:10 +1000 Subject: perf_counter: don't count scheduler ticks as context switches The context-switch software counter gives inflated values at present because each scheduler tick and each process-wide counter enable/disable prctl gets counted as a context switch. This happens because perf_counter_task_tick, perf_counter_task_disable and perf_counter_task_enable all call perf_counter_task_sched_out, which calls perf_swcounter_event to record a context switch event. This fixes it by introducing a variant of perf_counter_task_sched_out with two underscores in front for internal use within the perf_counter code, and makes perf_counter_task_{tick,disable,enable} call it. This variant doesn't record a context switch event, and takes a struct perf_counter_context *. This adds the new variant rather than changing the behaviour or interface of perf_counter_task_sched_out because that is called from other code. [ Impact: fix inflated context-switch event counts ] Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Corey Ashford LKML-Reference: <18951.48034.485580.498953@drongo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index a5bdc93ac47..7373b96bc36 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -837,6 +837,14 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) cpuctx->task_ctx = NULL; } +static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + + __perf_counter_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; +} + static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) { __perf_counter_sched_out(&cpuctx->ctx, cpuctx); @@ -943,15 +951,13 @@ int perf_counter_task_disable(void) struct perf_counter *counter; unsigned long flags; u64 perf_flags; - int cpu; if (likely(!ctx->nr_counters)) return 0; local_irq_save(flags); - cpu = smp_processor_id(); - perf_counter_task_sched_out(curr, cpu); + __perf_counter_task_sched_out(ctx); spin_lock(&ctx->lock); @@ -989,7 +995,7 @@ int perf_counter_task_enable(void) local_irq_save(flags); cpu = smp_processor_id(); - perf_counter_task_sched_out(curr, cpu); + __perf_counter_task_sched_out(ctx); spin_lock(&ctx->lock); @@ -1054,7 +1060,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) ctx = &curr->perf_counter_ctx; perf_counter_cpu_sched_out(cpuctx); - perf_counter_task_sched_out(curr, cpu); + __perf_counter_task_sched_out(ctx); rotate_ctx(&cpuctx->ctx); rotate_ctx(ctx); -- cgit v1.2.3 From 615a3f1e055ac9b0ae74e1f935a12ea2cfe2a2ad Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 11 May 2009 15:50:21 +1000 Subject: perf_counter: call atomic64_set for counter->count A compile warning triggered because we are calling atomic_set(&counter->count). But since counter->count is an atomic64_t, we have to use atomic64_set. So the count can be set short, resulting in the reset ioctl only resetting the low word. [ Impact: clear counter properly during the reset ioctl ] Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Corey Ashford LKML-Reference: <18951.48285.270311.981806@drongo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7373b96bc36..5ea0240adab 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1281,7 +1281,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) static void perf_counter_reset(struct perf_counter *counter) { (void)perf_counter_read(counter); - atomic_set(&counter->count, 0); + atomic64_set(&counter->count, 0); perf_counter_update_userpage(counter); } -- cgit v1.2.3 From 049862579333cc6cd9e6edfd6987cd0addfd8c59 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 11 May 2009 14:33:23 +0800 Subject: blktrace: pdu_buf of pc events should be unsigned I got this: 8,0 1 305.417782332 2037 I R 32 (ffffff9e 10 00 ...) [bash] It should be: 8,0 1 305.417782332 2037 I R 32 (9e 10 00 ...) [bash] [ Impact: fix output of pc events ] Signed-off-by: Li Zefan Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A07C6B3.9080802@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e099f8cc1d1..05b4747fd87 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1065,7 +1065,7 @@ static int blk_log_action(struct trace_iterator *iter, const char *act) static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) { - const char *pdu_buf; + const unsigned char *pdu_buf; int pdu_len; int i, end, ret; -- cgit v1.2.3 From 5a772b2b3c68e7e0b503c5a48469113bb0634314 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 May 2009 10:56:33 -0400 Subject: ring-buffer: replace constants with time macros in ring-buffer-benchmark The use of numeric constants is discouraged. It is cleaner and more descriptive to use macros for constant time conversions. This patch also removes an extra new line. [ Impact: more descriptive time conversions ] Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index a21aa7b3d05..7d3aef93c49 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -253,7 +253,7 @@ static void ring_buffer_producer(void) } time = end_tv.tv_sec - start_tv.tv_sec; - time *= 1000000; + time *= USEC_PER_SEC; time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec); entries = ring_buffer_entries(buffer); @@ -273,7 +273,8 @@ static void ring_buffer_producer(void) pr_info("Missed: %ld\n", missed); pr_info("Hit: %ld\n", hit); - do_div(time, 1000); + /* Convert time from usecs to millisecs */ + do_div(time, USEC_PER_MSEC); if (time) hit /= (long)time; else @@ -282,18 +283,19 @@ static void ring_buffer_producer(void) pr_info("Entries per millisec: %ld\n", hit); if (hit) { - avg = 1000000 / hit; + /* Calculate the average time in nanosecs */ + avg = NSEC_PER_MSEC / hit; pr_info("%ld ns per entry\n", avg); } - if (missed) { if (time) missed /= (long)time; pr_info("Total iterations per millisec: %ld\n", hit + missed); - avg = 1000000 / (hit + missed); + /* Caculate the average time in nanosecs */ + avg = NSEC_PER_MSEC / (hit + missed); pr_info("%ld ns per entry\n", avg); } } -- cgit v1.2.3 From d988ff94c1074c4c914235c8591bcceafb585ecf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 May 2009 11:03:57 -0400 Subject: ring-buffer: check for divide by zero in ring-buffer-benchmark Although we check if "missed" is not zero, we divide by hit + missed, and the addition can possible overflow and become a divide by zero. This patch checks for this case, and will report it when it happens then modify "hit" to make the calculation be non zero. [ Impact: prevent possible divide by zero in ring-buffer-benchmark ] Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 7d3aef93c49..8d68e149a8b 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -294,6 +294,12 @@ static void ring_buffer_producer(void) pr_info("Total iterations per millisec: %ld\n", hit + missed); + /* it is possible that hit + missed will overflow and be zero */ + if (!(hit + missed)) { + pr_info("hit + missed overflowed and totalled zero!\n"); + hit--; /* make it non zero */ + } + /* Caculate the average time in nanosecs */ avg = NSEC_PER_MSEC / (hit + missed); pr_info("%ld ns per entry\n", avg); -- cgit v1.2.3 From 1cd8d7358948909ab80b254eb14bcebc555ad417 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 14:08:09 -0400 Subject: ring-buffer: remove type parameter from rb_reserve_next_event The rb_reserve_next_event is only called for the data type (type = 0). There is no reason to pass in the type to the function. Before: text data bss dec hex filename 16554 24 12 16590 40ce kernel/trace/ring_buffer.o After: text data bss dec hex filename 16538 24 12 16574 40be kernel/trace/ring_buffer.o [ Impact: cleaner, smaller and slightly more efficient code ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 361170609bd..fe40f6c3507 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1389,7 +1389,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_event * rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length) + unsigned long length) { struct ring_buffer_event *event; u64 ts, delta; @@ -1448,7 +1448,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, /* Non commits have zero deltas */ delta = 0; - event = __rb_reserve_next(cpu_buffer, type, length, &ts); + event = __rb_reserve_next(cpu_buffer, 0, length, &ts); if (PTR_ERR(event) == -EAGAIN) goto again; @@ -1556,7 +1556,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (length > BUF_PAGE_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, 0, length); + event = rb_reserve_next_event(cpu_buffer, length); if (!event) goto out; @@ -1782,7 +1782,7 @@ int ring_buffer_write(struct ring_buffer *buffer, goto out; event_length = rb_calculate_event_length(length); - event = rb_reserve_next_event(cpu_buffer, 0, event_length); + event = rb_reserve_next_event(cpu_buffer, event_length); if (!event) goto out; -- cgit v1.2.3 From be957c447f7233a67904a1b11eb3ab61e702bf4d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 14:42:53 -0400 Subject: ring-buffer: move calculation of event length The event length is calculated and passed in to rb_reserve_next_event in two different locations. Having rb_reserve_next_event do the calculations directly makes only one location to do the change and causes the calculation to be inlined by gcc. Before: text data bss dec hex filename 16538 24 12 16574 40be kernel/trace/ring_buffer.o After: text data bss dec hex filename 16490 24 12 16526 408e kernel/trace/ring_buffer.o [ Impact: smaller more efficient code ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fe40f6c3507..493cba46abc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -367,6 +367,9 @@ static inline int test_time_stamp(u64 delta) #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) +/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ +#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) + int ring_buffer_print_page_header(struct trace_seq *s) { struct buffer_data_page field; @@ -1396,6 +1399,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, int commit = 0; int nr_loops = 0; + length = rb_calculate_event_length(length); again: /* * We allow for interrupts to reenter here and do a trace. @@ -1552,8 +1556,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (atomic_read(&cpu_buffer->record_disabled)) goto out; - length = rb_calculate_event_length(length); - if (length > BUF_PAGE_SIZE) + if (length > BUF_MAX_DATA_SIZE) goto out; event = rb_reserve_next_event(cpu_buffer, length); @@ -1758,7 +1761,6 @@ int ring_buffer_write(struct ring_buffer *buffer, { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; - unsigned long event_length; void *body; int ret = -EBUSY; int cpu, resched; @@ -1781,8 +1783,10 @@ int ring_buffer_write(struct ring_buffer *buffer, if (atomic_read(&cpu_buffer->record_disabled)) goto out; - event_length = rb_calculate_event_length(length); - event = rb_reserve_next_event(cpu_buffer, event_length); + if (length > BUF_MAX_DATA_SIZE) + goto out; + + event = rb_reserve_next_event(cpu_buffer, length); if (!event) goto out; -- cgit v1.2.3 From 5031296c57024a78ddad4edfc993367dbf4abb98 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 7 May 2009 16:54:11 -0700 Subject: x86: add extension fields for bootloader type and version A long ago, in days of yore, it all began with a god named Thor. There were vikings and boats and some plans for a Linux kernel header. Unfortunately, a single 8-bit field was used for bootloader type and version. This has generally worked without *too* much pain, but we're getting close to flat running out of ID fields. Add extension fields for both type and version. The type will be extended if it the old field is 0xE; the version is a simple MSB extension. Keep /proc/sys/kernel/bootloader_type containing (type << 4) + (ver & 0xf) for backwards compatiblity, but also add /proc/sys/kernel/bootloader_version which contains the full version number. [ Impact: new feature to support more bootloaders ] Signed-off-by: H. Peter Anvin --- kernel/sysctl.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e3d2c7dd59b..cf91c9317b2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -727,6 +727,14 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "bootloader_version", + .data = &bootloader_version, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "kstack_depth_to_print", -- cgit v1.2.3 From 0f0c85fc80adbbd2265d89867d743f929d516805 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 16:08:00 -0400 Subject: ring-buffer: small optimizations Doing some small changes in the fast path of the ring buffer recording saves over 3% in the ring-buffer-benchmark test. [ Impact: a little faster ring buffer recording ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 493cba46abc..f452de2ce49 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1000,7 +1000,7 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); } -static int +static inline int rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { @@ -1423,9 +1423,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, * also be made. But only the entry that did the actual * commit will be something other than zero. */ - if (cpu_buffer->tail_page == cpu_buffer->commit_page && - rb_page_write(cpu_buffer->tail_page) == - rb_commit_index(cpu_buffer)) { + if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && + rb_page_write(cpu_buffer->tail_page) == + rb_commit_index(cpu_buffer))) { delta = ts - cpu_buffer->write_stamp; @@ -1436,7 +1436,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(ts < cpu_buffer->write_stamp)) delta = 0; - if (test_time_stamp(delta)) { + else if (unlikely(test_time_stamp(delta))) { commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); @@ -1470,7 +1470,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, * If the timestamp was commited, make the commit our entry * now so that we will update it when needed. */ - if (commit) + if (unlikely(commit)) rb_set_commit_event(cpu_buffer, event); else if (!rb_is_commit(cpu_buffer, event)) delta = 0; -- cgit v1.2.3 From 88eb0125362f2ab272cbaf84252cf101ddc2dec9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 16:28:23 -0400 Subject: ring-buffer: use internal time stamp function The ring_buffer_time_stamp that is exported adds a little more overhead than is needed for using it internally. This patch adds an internal timestamp function that can be inlined (a single line function) and used internally for the ring buffer. [ Impact: a little less overhead to the ring buffer ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f452de2ce49..a9e645a5bc1 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -454,13 +454,18 @@ struct ring_buffer_iter { /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 +static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) +{ + /* shift to debug/test normalization and TIME_EXTENTS */ + return buffer->clock() << DEBUG_SHIFT; +} + u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) { u64 time; preempt_disable_notrace(); - /* shift to debug/test normalization and TIME_EXTENTS */ - time = buffer->clock() << DEBUG_SHIFT; + time = rb_time_stamp(buffer, cpu); preempt_enable_no_resched_notrace(); return time; @@ -1247,7 +1252,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->tail_page = next_page; /* reread the time stamp */ - *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); + *ts = rb_time_stamp(buffer, cpu_buffer->cpu); cpu_buffer->tail_page->page->time_stamp = *ts; } @@ -1413,7 +1418,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) return NULL; - ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); + ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); /* * Only the first commit can update the timestamp. -- cgit v1.2.3 From 168b6b1d0594c7866caa73b12f3b8d91075695f2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 22:11:05 -0400 Subject: ring-buffer: move code around to remove some branches This is a bit of micro-optimizations. But since the ring buffer is used in tracing every function call, it is an extreme hot path. Every nanosecond counts. This change shows over 5% improvement in the ring-buffer-benchmark. [ Impact: more efficient code ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a9e645a5bc1..16b24d49604 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1400,7 +1400,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length) { struct ring_buffer_event *event; - u64 ts, delta; + u64 ts, delta = 0; int commit = 0; int nr_loops = 0; @@ -1431,20 +1431,21 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && rb_page_write(cpu_buffer->tail_page) == rb_commit_index(cpu_buffer))) { + u64 diff; - delta = ts - cpu_buffer->write_stamp; + diff = ts - cpu_buffer->write_stamp; - /* make sure this delta is calculated here */ + /* make sure this diff is calculated here */ barrier(); /* Did the write stamp get updated already? */ if (unlikely(ts < cpu_buffer->write_stamp)) - delta = 0; + goto get_event; - else if (unlikely(test_time_stamp(delta))) { + delta = diff; + if (unlikely(test_time_stamp(delta))) { commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); - if (commit == -EBUSY) return NULL; @@ -1453,12 +1454,11 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, RB_WARN_ON(cpu_buffer, commit < 0); } - } else - /* Non commits have zero deltas */ - delta = 0; + } + get_event: event = __rb_reserve_next(cpu_buffer, 0, length, &ts); - if (PTR_ERR(event) == -EAGAIN) + if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; if (!event) { -- cgit v1.2.3 From e758a33d6fc5b9d6a3ae489863d04fcecad8120b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 12 May 2009 21:59:01 +1000 Subject: perf_counter: call hw_perf_save_disable/restore around group_sched_in I noticed that when enabling a group via the PERF_COUNTER_IOC_ENABLE ioctl on the group leader, the counters weren't enabled and counting immediately on return from the ioctl, but did start counting a little while later (presumably after a context switch). The reason was that __perf_counter_enable calls group_sched_in which calls hw_perf_group_sched_in, which on powerpc assumes that the caller has called hw_perf_save_disable already. Until commit 46d686c6 ("perf_counter: put whole group on when enabling group leader") it was true that all callers of group_sched_in had called hw_perf_save_disable first, and the powerpc hw_perf_group_sched_in relies on that (there isn't an x86 version). This fixes the problem by putting calls to hw_perf_save_disable / hw_perf_restore around the calls to group_sched_in and counter_sched_in in __perf_counter_enable. Having the calls to hw_perf_save_disable/restore around the counter_sched_in call is harmless and makes this call consistent with the other call sites of counter_sched_in, which have all called hw_perf_save_disable first. [ Impact: more precise counter group disable/enable functionality ] Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Corey Ashford LKML-Reference: <18953.25733.53359.147452@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5ea0240adab..ff166c11b69 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -663,6 +663,7 @@ static void __perf_counter_enable(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter_context *ctx = counter->ctx; struct perf_counter *leader = counter->group_leader; + unsigned long pmuflags; unsigned long flags; int err; @@ -689,14 +690,18 @@ static void __perf_counter_enable(void *info) if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) goto unlock; - if (!group_can_go_on(counter, cpuctx, 1)) + if (!group_can_go_on(counter, cpuctx, 1)) { err = -EEXIST; - else if (counter == leader) - err = group_sched_in(counter, cpuctx, ctx, - smp_processor_id()); - else - err = counter_sched_in(counter, cpuctx, ctx, - smp_processor_id()); + } else { + pmuflags = hw_perf_save_disable(); + if (counter == leader) + err = group_sched_in(counter, cpuctx, ctx, + smp_processor_id()); + else + err = counter_sched_in(counter, cpuctx, ctx, + smp_processor_id()); + hw_perf_restore(pmuflags); + } if (err) { /* -- cgit v1.2.3 From d80c19df5fcceb8c741e96f09f275c2da719efef Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 12 May 2009 16:29:13 +0200 Subject: lockdep: increase MAX_LOCKDEP_ENTRIES and MAX_LOCKDEP_CHAINS Now that lockdep coverage has increased it has become easier to run out of entries: [ 21.401387] BUG: MAX_LOCKDEP_ENTRIES too low! [ 21.402007] turning off the locking correctness validator. [ 21.402007] Pid: 1555, comm: S99local Not tainted 2.6.30-rc5-tip #2 [ 21.402007] Call Trace: [ 21.402007] [] add_lock_to_list+0x53/0xba [ 21.402007] [] ? lookup_mnt+0x19/0x53 [ 21.402007] [] check_prev_add+0x14b/0x1c7 [ 21.402007] [] validate_chain+0x474/0x52a [ 21.402007] [] __lock_acquire+0x342/0x3c7 [ 21.402007] [] lock_acquire+0xc1/0xe5 [ 21.402007] [] ? lookup_mnt+0x19/0x53 [ 21.402007] [] _spin_lock+0x31/0x66 Double the size - as we've done in the past. [ Impact: allow lockdep to cover more locks ] Acked-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/lockdep_internals.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index a2cc7e9a6e8..699a2ac3a0d 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -54,9 +54,9 @@ enum { * table (if it's not there yet), and we check it for lock order * conflicts and deadlocks. */ -#define MAX_LOCKDEP_ENTRIES 8192UL +#define MAX_LOCKDEP_ENTRIES 16384UL -#define MAX_LOCKDEP_CHAINS_BITS 14 +#define MAX_LOCKDEP_CHAINS_BITS 15 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) -- cgit v1.2.3 From 1ec7c4849c214fc78b023230264399836ea3b245 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 May 2009 23:40:06 -0400 Subject: tracing: stop stack trace on first empty entry The stack tracer stores eight entries in the ring buffer when an event traces the stack. The output outputs all eight entries regardless of how many entries were recorded. This patch breaks out of the loop when a null entry is discovered. [ Impact: only print the stack that is recorded ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8bd9a2c1a46..489c0e8ada0 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -898,6 +898,8 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (!field->caller[i]) + break; if (i) { if (!trace_seq_puts(s, " <= ")) goto partial; -- cgit v1.2.3 From 8cd995b6deedf98b7694ed32a786ee7f793d1eec Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 15 May 2009 11:07:27 +0800 Subject: tracing/filters: add missing unlock in a failure path [ Impact: fix deadlock in a rare case we fail to allocate memory ] Signed-off-by: Li Zefan LKML-Reference: <4A0CDC6F.7070200@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 85ad6a8939a..22c29984fe0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1079,9 +1079,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) return 0; } + err = -ENOMEM; ps = kzalloc(sizeof(*ps), GFP_KERNEL); if (!ps) - return -ENOMEM; + goto out_unlock; filter_disable_preds(call); replace_filter_string(call->filter, filter_string); @@ -1101,7 +1102,7 @@ out: filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); - +out_unlock: mutex_unlock(&filter_mutex); return err; @@ -1123,9 +1124,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system, return 0; } + err = -ENOMEM; ps = kzalloc(sizeof(*ps), GFP_KERNEL); if (!ps) - return -ENOMEM; + goto out_unlock; filter_free_subsystem_preds(system); replace_filter_string(system->filter, filter_string); @@ -1145,7 +1147,7 @@ out: filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); - +out_unlock: mutex_unlock(&filter_mutex); return err; -- cgit v1.2.3 From 5872144f64b34a5942f6b4acedc90b02de72c58b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 15 May 2009 11:07:56 +0800 Subject: tracing/filters: fix off-by-one bug We should leave the last slot for the ending '\0'. [ Impact: fix possible crash when the length of an operand is 128 ] Signed-off-by: Li Zefan LKML-Reference: <4A0CDC8C.30602@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 22c29984fe0..a7430b16d24 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -736,7 +736,7 @@ static inline void clear_operand_string(struct filter_parse_state *ps) static inline int append_operand_char(struct filter_parse_state *ps, char c) { - if (ps->operand.tail == MAX_FILTER_STR_VAL) + if (ps->operand.tail == MAX_FILTER_STR_VAL - 1) return -EINVAL; ps->operand.string[ps->operand.tail++] = c; -- cgit v1.2.3 From 53020fe81eecd0b7be295868ce5850ef8f41074e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 21:26:19 +0200 Subject: perf_counter: Fix perf_output_copy() WARN to account for overflow The simple reservation test in perf_output_copy() failed to take unsigned int overflow into account, fix this. [ Impact: fix false positive warning with more than 4GB of profiling data ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ff166c11b69..985be0b662a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1927,7 +1927,11 @@ static void perf_output_copy(struct perf_output_handle *handle, handle->offset = offset; - WARN_ON_ONCE(handle->offset > handle->head); + /* + * Check we didn't copy past our reservation window, taking the + * possible unsigned int wrap into account. + */ + WARN_ON_ONCE(((int)(handle->head - handle->offset)) < 0); } #define perf_output_put(handle, x) \ -- cgit v1.2.3 From 9e35ad388bea89f7d6f375af4c0ae98803688666 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 16:21:38 +0200 Subject: perf_counter: Rework the perf counter disable/enable The current disable/enable mechanism is: token = hw_perf_save_disable(); ... /* do bits */ ... hw_perf_restore(token); This works well, provided that the use nests properly. Except we don't. x86 NMI/INT throttling has non-nested use of this, breaking things. Therefore provide a reference counter disable/enable interface, where the first disable disables the hardware, and the last enable enables the hardware again. [ Impact: refactor, simplify the PMU disable/enable logic ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 76 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 985be0b662a..e814ff04d7c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -60,8 +60,9 @@ extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counte return NULL; } -u64 __weak hw_perf_save_disable(void) { return 0; } -void __weak hw_perf_restore(u64 ctrl) { barrier(); } +void __weak hw_perf_disable(void) { barrier(); } +void __weak hw_perf_enable(void) { barrier(); } + void __weak hw_perf_counter_setup(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, @@ -72,6 +73,32 @@ int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, void __weak perf_counter_print_debug(void) { } +static DEFINE_PER_CPU(int, disable_count); + +void __perf_disable(void) +{ + __get_cpu_var(disable_count)++; +} + +bool __perf_enable(void) +{ + return !--__get_cpu_var(disable_count); +} + +void perf_disable(void) +{ + __perf_disable(); + hw_perf_disable(); +} +EXPORT_SYMBOL_GPL(perf_disable); /* ACPI idle */ + +void perf_enable(void) +{ + if (__perf_enable()) + hw_perf_enable(); +} +EXPORT_SYMBOL_GPL(perf_enable); /* ACPI idle */ + static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -170,7 +197,6 @@ static void __perf_counter_remove_from_context(void *info) struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; unsigned long flags; - u64 perf_flags; /* * If this is a task context, we need to check whether it is @@ -191,9 +217,9 @@ static void __perf_counter_remove_from_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_del_counter(counter, ctx); - hw_perf_restore(perf_flags); + perf_enable(); if (!ctx->task) { /* @@ -538,7 +564,6 @@ static void __perf_install_in_context(void *info) struct perf_counter *leader = counter->group_leader; int cpu = smp_processor_id(); unsigned long flags; - u64 perf_flags; int err; /* @@ -556,7 +581,7 @@ static void __perf_install_in_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - perf_flags = hw_perf_save_disable(); + perf_disable(); add_counter_to_ctx(counter, ctx); @@ -596,7 +621,7 @@ static void __perf_install_in_context(void *info) cpuctx->max_pertask--; unlock: - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock_irqrestore(&ctx->lock, flags); } @@ -663,7 +688,6 @@ static void __perf_counter_enable(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter_context *ctx = counter->ctx; struct perf_counter *leader = counter->group_leader; - unsigned long pmuflags; unsigned long flags; int err; @@ -693,14 +717,14 @@ static void __perf_counter_enable(void *info) if (!group_can_go_on(counter, cpuctx, 1)) { err = -EEXIST; } else { - pmuflags = hw_perf_save_disable(); + perf_disable(); if (counter == leader) err = group_sched_in(counter, cpuctx, ctx, smp_processor_id()); else err = counter_sched_in(counter, cpuctx, ctx, smp_processor_id()); - hw_perf_restore(pmuflags); + perf_enable(); } if (err) { @@ -795,7 +819,6 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx) { struct perf_counter *counter; - u64 flags; spin_lock(&ctx->lock); ctx->is_active = 0; @@ -803,12 +826,12 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, goto out; update_context_time(ctx); - flags = hw_perf_save_disable(); + perf_disable(); if (ctx->nr_active) { list_for_each_entry(counter, &ctx->counter_list, list_entry) group_sched_out(counter, cpuctx, ctx); } - hw_perf_restore(flags); + perf_enable(); out: spin_unlock(&ctx->lock); } @@ -860,7 +883,6 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx, int cpu) { struct perf_counter *counter; - u64 flags; int can_add_hw = 1; spin_lock(&ctx->lock); @@ -870,7 +892,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, ctx->timestamp = perf_clock(); - flags = hw_perf_save_disable(); + perf_disable(); /* * First go through the list and put on any pinned groups @@ -917,7 +939,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, can_add_hw = 0; } } - hw_perf_restore(flags); + perf_enable(); out: spin_unlock(&ctx->lock); } @@ -955,7 +977,6 @@ int perf_counter_task_disable(void) struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; unsigned long flags; - u64 perf_flags; if (likely(!ctx->nr_counters)) return 0; @@ -969,7 +990,7 @@ int perf_counter_task_disable(void) /* * Disable all the counters: */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state != PERF_COUNTER_STATE_ERROR) { @@ -978,7 +999,7 @@ int perf_counter_task_disable(void) } } - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock_irqrestore(&ctx->lock, flags); @@ -991,7 +1012,6 @@ int perf_counter_task_enable(void) struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; unsigned long flags; - u64 perf_flags; int cpu; if (likely(!ctx->nr_counters)) @@ -1007,7 +1027,7 @@ int perf_counter_task_enable(void) /* * Disable all the counters: */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state > PERF_COUNTER_STATE_OFF) @@ -1017,7 +1037,7 @@ int perf_counter_task_enable(void) ctx->time - counter->total_time_enabled; counter->hw_event.disabled = 0; } - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock(&ctx->lock); @@ -1034,7 +1054,6 @@ int perf_counter_task_enable(void) static void rotate_ctx(struct perf_counter_context *ctx) { struct perf_counter *counter; - u64 perf_flags; if (!ctx->nr_counters) return; @@ -1043,12 +1062,12 @@ static void rotate_ctx(struct perf_counter_context *ctx) /* * Rotate the first entry last (works just fine for group counters too): */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { list_move_tail(&counter->list_entry, &ctx->counter_list); break; } - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock(&ctx->lock); } @@ -3194,7 +3213,6 @@ __perf_counter_exit_task(struct task_struct *child, } else { struct perf_cpu_context *cpuctx; unsigned long flags; - u64 perf_flags; /* * Disable and unlink this counter. @@ -3203,7 +3221,7 @@ __perf_counter_exit_task(struct task_struct *child, * could still be processing it: */ local_irq_save(flags); - perf_flags = hw_perf_save_disable(); + perf_disable(); cpuctx = &__get_cpu_var(perf_cpu_context); @@ -3214,7 +3232,7 @@ __perf_counter_exit_task(struct task_struct *child, child_ctx->nr_counters--; - hw_perf_restore(perf_flags); + perf_enable(); local_irq_restore(flags); } -- cgit v1.2.3 From f1a11e0576c7a73d759d05d776692b2b2d37172b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 May 2009 19:21:40 +0200 Subject: futex: remove the wait queue The waitqueue which is used in struct futex_q is a leftover from the futexfd implementation. There is no need to use a waitqueue at all, as the waiting task is the only user of it. The waitqueue just adds additional locking and a loop in the wake up path which both can be avoided. We have already a task reference in struct futex_q which is used for PI futexes. Use it for normal futexes as well and just wake up the task directly. The logic of signalling the futex wakeup via setting q->lock_ptr to NULL is kept with the difference that we set it NULL before doing the wakeup. This opens an exit race window vs. a non futex wake up of the to be woken up task, which we prevent with get_task_struct / put_task_struct on the waiter. [ Impact: simplification ] Signed-off-by: Thomas Gleixner --- kernel/futex.c | 58 +++++++++++++++++++++++++--------------------------------- 1 file changed, 25 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index aec8bf89bf4..157bfcd725b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -100,8 +100,8 @@ struct futex_pi_state { */ struct futex_q { struct plist_node list; - /* There can only be a single waiter */ - wait_queue_head_t waiter; + /* Waiter reference */ + struct task_struct *task; /* Which hash list lock to use: */ spinlock_t *lock_ptr; @@ -111,7 +111,6 @@ struct futex_q { /* Optional priority inheritance state: */ struct futex_pi_state *pi_state; - struct task_struct *task; /* rt_waiter storage for requeue_pi: */ struct rt_mutex_waiter *rt_waiter; @@ -694,22 +693,29 @@ retry: */ static void wake_futex(struct futex_q *q) { - plist_del(&q->list, &q->list.plist); + struct task_struct *p = q->task; + /* - * The lock in wake_up_all() is a crucial memory barrier after the - * plist_del() and also before assigning to q->lock_ptr. + * We set q->lock_ptr = NULL _before_ we wake up the task. If + * a non futex wake up happens on another CPU then the task + * might exit and p would dereference a non existing task + * struct. Prevent this by holding a reference on p across the + * wake up. */ - wake_up(&q->waiter); + get_task_struct(p); + + plist_del(&q->list, &q->list.plist); /* - * The waiting task can free the futex_q as soon as this is written, - * without taking any locks. This must come last. - * - * A memory barrier is required here to prevent the following store to - * lock_ptr from getting ahead of the wakeup. Clearing the lock at the - * end of wake_up() does not prevent this store from moving. + * The waiting task can free the futex_q as soon as + * q->lock_ptr = NULL is written, without taking any locks. A + * memory barrier is required here to prevent the following + * store to lock_ptr from getting ahead of the plist_del. */ smp_wmb(); q->lock_ptr = NULL; + + wake_up_state(p, TASK_NORMAL); + put_task_struct(p); } static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) @@ -1003,7 +1009,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; - wake_up(&q->waiter); + wake_up_state(q->task, TASK_NORMAL); } /** @@ -1280,8 +1286,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) { struct futex_hash_bucket *hb; - init_waitqueue_head(&q->waiter); - get_futex_key_refs(&q->key); hb = hash_futex(&q->key); q->lock_ptr = &hb->lock; @@ -1575,11 +1579,9 @@ out: * @hb: the futex hash bucket, must be locked by the caller * @q: the futex_q to queue up on * @timeout: the prepared hrtimer_sleeper, or null for no timeout - * @wait: the wait_queue to add to the futex_q after queueing in the hb */ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, - struct hrtimer_sleeper *timeout, - wait_queue_t *wait) + struct hrtimer_sleeper *timeout) { queue_me(q, hb); @@ -1587,19 +1589,11 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * There might have been scheduling since the queue_me(), as we * cannot hold a spinlock across the get_user() in case it * faults, and we cannot just set TASK_INTERRUPTIBLE state when - * queueing ourselves into the futex hash. This code thus has to + * queueing ourselves into the futex hash. This code thus has to * rely on the futex_wake() code removing us from hash when it * wakes us up. */ - - /* add_wait_queue is the barrier after __set_current_state. */ - __set_current_state(TASK_INTERRUPTIBLE); - - /* - * Add current as the futex_q waiter. We don't remove ourselves from - * the wait_queue because we are the only user of it. - */ - add_wait_queue(&q->waiter, wait); + set_current_state(TASK_INTERRUPTIBLE); /* Arm the timer */ if (timeout) { @@ -1704,7 +1698,6 @@ static int futex_wait(u32 __user *uaddr, int fshared, u32 val, ktime_t *abs_time, u32 bitset, int clockrt) { struct hrtimer_sleeper timeout, *to = NULL; - DECLARE_WAITQUEUE(wait, current); struct restart_block *restart; struct futex_hash_bucket *hb; struct futex_q q; @@ -1733,7 +1726,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, goto out; /* queue_me and wait for wakeup, timeout, or a signal. */ - futex_wait_queue_me(hb, &q, to, &wait); + futex_wait_queue_me(hb, &q, to); /* If we were woken (and unqueued), we succeeded, whatever. */ ret = 0; @@ -2147,7 +2140,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, struct hrtimer_sleeper timeout, *to = NULL; struct rt_mutex_waiter rt_waiter; struct rt_mutex *pi_mutex = NULL; - DECLARE_WAITQUEUE(wait, current); struct restart_block *restart; struct futex_hash_bucket *hb; union futex_key key2; @@ -2191,7 +2183,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, } /* Queue the futex_q, drop the hb lock, wait for wakeup. */ - futex_wait_queue_me(hb, &q, to, &wait); + futex_wait_queue_me(hb, &q, to); spin_lock(&hb->lock); ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); -- cgit v1.2.3 From 548e1ddf255b4ebfb4ef20c08936fd8d4deb3bd9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:19:26 +0200 Subject: perf_counter: remove perf_disable/enable exports Now that ACPI idle doesn't use it anymore, remove the exports. [ Impact: remove dead code/data ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090515132018.429826617@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e814ff04d7c..0173738dd54 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -90,14 +90,12 @@ void perf_disable(void) __perf_disable(); hw_perf_disable(); } -EXPORT_SYMBOL_GPL(perf_disable); /* ACPI idle */ void perf_enable(void) { if (__perf_enable()) hw_perf_enable(); } -EXPORT_SYMBOL_GPL(perf_enable); /* ACPI idle */ static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) -- cgit v1.2.3 From 789f90fcf6b0b54e655740e9396c954378542c79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:19:27 +0200 Subject: perf_counter: per user mlock gift Instead of a per-process mlock gift for perf-counters, use a per-user gift so that there is less of a DoS potential. [ Impact: allow less worst-case unprivileged memory consumption ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090515132018.496182835@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0173738dd54..93f4a0e4b87 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -45,7 +45,7 @@ static atomic_t nr_munmap_tracking __read_mostly; static atomic_t nr_comm_tracking __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ -int sysctl_perf_counter_mlock __read_mostly = 128; /* 'free' kb per counter */ +int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ /* * Lock for (sysadmin-configurable) counter reservations: @@ -1522,6 +1522,9 @@ static void perf_mmap_close(struct vm_area_struct *vma) if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { + struct user_struct *user = current_user(); + + atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); vma->vm_mm->locked_vm -= counter->data->nr_locked; perf_mmap_data_free(counter); mutex_unlock(&counter->mmap_mutex); @@ -1537,11 +1540,13 @@ static struct vm_operations_struct perf_mmap_vmops = { static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_counter *counter = file->private_data; + struct user_struct *user = current_user(); unsigned long vma_size; unsigned long nr_pages; + unsigned long user_locked, user_lock_limit; unsigned long locked, lock_limit; + long user_extra, extra; int ret = 0; - long extra; if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) return -EINVAL; @@ -1569,15 +1574,17 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - extra = nr_pages /* + 1 only account the data pages */; - extra -= sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); - if (extra < 0) - extra = 0; + user_extra = nr_pages + 1; + user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); + user_locked = atomic_long_read(&user->locked_vm) + user_extra; - locked = vma->vm_mm->locked_vm + extra; + extra = 0; + if (user_locked > user_lock_limit) + extra = user_locked - user_lock_limit; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; lock_limit >>= PAGE_SHIFT; + locked = vma->vm_mm->locked_vm + extra; if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { ret = -EPERM; @@ -1590,6 +1597,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; atomic_set(&counter->mmap_count, 1); + atomic_long_add(user_extra, &user->locked_vm); vma->vm_mm->locked_vm += extra; counter->data->nr_locked = extra; unlock: -- cgit v1.2.3 From 60db5e09c13109b13830cc9dcae688003fd39e79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:19:28 +0200 Subject: perf_counter: frequency based adaptive irq_period Instead of specifying the irq_period for a counter, provide a target interrupt frequency and dynamically adapt the irq_period to match this frequency. [ Impact: new perf-counter attribute/feature ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090515132018.646195868@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 63 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 93f4a0e4b87..0ad1db4f3d6 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1046,6 +1046,38 @@ int perf_counter_task_enable(void) return 0; } +void perf_adjust_freq(struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + u64 irq_period; + u64 events, period; + s64 delta; + + spin_lock(&ctx->lock); + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + continue; + + if (!counter->hw_event.freq || !counter->hw_event.irq_freq) + continue; + + events = HZ * counter->hw.interrupts * counter->hw.irq_period; + period = div64_u64(events, counter->hw_event.irq_freq); + + delta = (s64)(1 + period - counter->hw.irq_period); + delta >>= 1; + + irq_period = counter->hw.irq_period + delta; + + if (!irq_period) + irq_period = 1; + + counter->hw.irq_period = irq_period; + counter->hw.interrupts = 0; + } + spin_unlock(&ctx->lock); +} + /* * Round-robin a context's counters: */ @@ -1081,6 +1113,9 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = &curr->perf_counter_ctx; + perf_adjust_freq(&cpuctx->ctx); + perf_adjust_freq(ctx); + perf_counter_cpu_sched_out(cpuctx); __perf_counter_task_sched_out(ctx); @@ -2382,6 +2417,8 @@ int perf_counter_overflow(struct perf_counter *counter, int events = atomic_read(&counter->event_limit); int ret = 0; + counter->hw.interrupts++; + /* * XXX event_limit might not quite work as expected on inherited * counters @@ -2450,6 +2487,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_counter *counter; struct pt_regs *regs; + u64 period; counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); counter->pmu->read(counter); @@ -2468,7 +2506,8 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) ret = HRTIMER_NORESTART; } - hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); + period = max_t(u64, 10000, counter->hw.irq_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); return ret; } @@ -2629,8 +2668,9 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; if (hwc->irq_period) { + u64 period = max_t(u64, 10000, hwc->irq_period); __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(hwc->irq_period), 0, + ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); } @@ -2679,8 +2719,9 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; if (hwc->irq_period) { + u64 period = max_t(u64, 10000, hwc->irq_period); __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(hwc->irq_period), 0, + ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); } @@ -2811,9 +2852,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) { - struct perf_counter_hw_event *hw_event = &counter->hw_event; const struct pmu *pmu = NULL; - struct hw_perf_counter *hwc = &counter->hw; /* * Software counters (currently) can't in general distinguish @@ -2826,8 +2865,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_CPU_CLOCK: pmu = &perf_ops_cpu_clock; - if (hw_event->irq_period && hw_event->irq_period < 10000) - hw_event->irq_period = 10000; break; case PERF_COUNT_TASK_CLOCK: /* @@ -2839,8 +2876,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) else pmu = &perf_ops_cpu_clock; - if (hw_event->irq_period && hw_event->irq_period < 10000) - hw_event->irq_period = 10000; break; case PERF_COUNT_PAGE_FAULTS: case PERF_COUNT_PAGE_FAULTS_MIN: @@ -2854,9 +2889,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) break; } - if (pmu) - hwc->irq_period = hw_event->irq_period; - return pmu; } @@ -2872,6 +2904,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, { const struct pmu *pmu; struct perf_counter *counter; + struct hw_perf_counter *hwc; long err; counter = kzalloc(sizeof(*counter), gfpflags); @@ -2907,6 +2940,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, pmu = NULL; + hwc = &counter->hw; + if (hw_event->freq && hw_event->irq_freq) + hwc->irq_period = TICK_NSEC / hw_event->irq_freq; + else + hwc->irq_period = hw_event->irq_period; + /* * we currently do not support PERF_RECORD_GROUP on inherited counters */ -- cgit v1.2.3 From dce48a84adf1806676319f6f480e30a6daa012f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 11 Apr 2009 10:43:41 +0200 Subject: sched, timers: move calc_load() to scheduler Dimitri Sivanich noticed that xtime_lock is held write locked across calc_load() which iterates over all online CPUs. That can cause long latencies for xtime_lock readers on large SMP systems. The load average calculation is an rough estimate anyway so there is no real need to protect the readers vs. the update. It's not a problem when the avenrun array is updated while a reader copies the values. Instead of iterating over all online CPUs let the scheduler_tick code update the number of active tasks shortly before the avenrun update happens. The avenrun update itself is handled by the CPU which calls do_timer(). [ Impact: reduce xtime_lock write locked section ] Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra --- kernel/sched.c | 84 +++++++++++++++++++++++++++++++++++++++++------ kernel/sched_idletask.c | 3 +- kernel/time/timekeeping.c | 2 +- kernel/timer.c | 54 ++---------------------------- 4 files changed, 79 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8908d190a34..f4eb88153bd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -630,6 +630,10 @@ struct rq { struct list_head migration_queue; #endif + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP int hrtick_csd_pending; @@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) } #endif +static void calc_load_account_active(struct rq *this_rq); + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -2856,19 +2862,57 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_active(void) +/* Variables and functions for calc_load */ +static atomic_long_t calc_load_tasks; +static unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); + +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) { - unsigned long i, running = 0, uninterruptible = 0; + load *= exp; + load += active * (FIXED_1 - exp); + return load >> FSHIFT; +} - for_each_online_cpu(i) { - running += cpu_rq(i)->nr_running; - uninterruptible += cpu_rq(i)->nr_uninterruptible; - } +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(void) +{ + unsigned long upd = calc_load_update + 10; + long active; + + if (time_before(jiffies, upd)) + return; - if (unlikely((long)uninterruptible < 0)) - uninterruptible = 0; + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; - return running + uninterruptible; + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; +} + +/* + * Either called from update_cpu_load() or from a cpu going idle + */ +static void calc_load_account_active(struct rq *this_rq) +{ + long nr_active, delta; + + nr_active = this_rq->nr_running; + nr_active += (long) this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + atomic_long_add(delta, &calc_load_tasks); + } } /* @@ -2899,6 +2943,11 @@ static void update_cpu_load(struct rq *this_rq) new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } + + if (time_after_eq(jiffies, this_rq->calc_load_update)) { + this_rq->calc_load_update += LOAD_FREQ; + calc_load_account_active(this_rq); + } } #ifdef CONFIG_SMP @@ -7091,6 +7140,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu) } } + +/* + * remove the tasks which were accounted by rq from calc_load_tasks. + */ +static void calc_global_load_remove(struct rq *rq) +{ + atomic_long_sub(rq->calc_load_active, &calc_load_tasks); +} #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -7325,6 +7382,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Update our root-domain */ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); + rq->calc_load_update = calc_load_update; + rq->calc_load_active = 0; if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -7364,7 +7423,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); - + calc_global_load_remove(rq); /* * No need to migrate the tasks: it was best-effort if * they didn't take sched_hotcpu_mutex. Just wake up @@ -9059,6 +9118,8 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); rq->nr_running = 0; + rq->calc_load_active = 0; + rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -9166,6 +9227,9 @@ void __init sched_init(void) * when this runqueue becomes "idle". */ init_idle(current, smp_processor_id()); + + calc_load_update = jiffies + LOAD_FREQ; + /* * During early bootup we pretend to be a normal task: */ diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 8a21a2e28c1..499672c10cb 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); - + /* adjust the active tasks as we might go into a long sleep */ + calc_load_account_active(rq); return rq->idle; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 687dff49f6e..52a8bf8931f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -22,7 +22,7 @@ /* * This read-write spinlock protects us from races in SMP while - * playing with xtime and avenrun. + * playing with xtime. */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); diff --git a/kernel/timer.c b/kernel/timer.c index cffffad01c3..6a21d7af962 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1122,47 +1122,6 @@ void update_process_times(int user_tick) run_posix_cpu_timers(p); } -/* - * Nr of active tasks - counted in fixed-point numbers - */ -static unsigned long count_active_tasks(void) -{ - return nr_active() * FIXED_1; -} - -/* - * Hmm.. Changed this, as the GNU make sources (load.c) seems to - * imply that avenrun[] is the standard name for this kind of thing. - * Nothing else seems to be standardized: the fractional size etc - * all seem to differ on different machines. - * - * Requires xtime_lock to access. - */ -unsigned long avenrun[3]; - -EXPORT_SYMBOL(avenrun); - -/* - * calc_load - given tick count, update the avenrun load estimates. - * This is called while holding a write_lock on xtime_lock. - */ -static inline void calc_load(unsigned long ticks) -{ - unsigned long active_tasks; /* fixed-point */ - static int count = LOAD_FREQ; - - count -= ticks; - if (unlikely(count < 0)) { - active_tasks = count_active_tasks(); - do { - CALC_LOAD(avenrun[0], EXP_1, active_tasks); - CALC_LOAD(avenrun[1], EXP_5, active_tasks); - CALC_LOAD(avenrun[2], EXP_15, active_tasks); - count += LOAD_FREQ; - } while (count < 0); - } -} - /* * This function runs timers and the timer-tq in bottom half context. */ @@ -1186,16 +1145,6 @@ void run_local_timers(void) softlockup_tick(); } -/* - * Called by the timer interrupt. xtime_lock must already be taken - * by the timer IRQ! - */ -static inline void update_times(unsigned long ticks) -{ - update_wall_time(); - calc_load(ticks); -} - /* * The 64-bit jiffies value is not atomic - you MUST NOT read it * without sampling the sequence number in xtime_lock. @@ -1205,7 +1154,8 @@ static inline void update_times(unsigned long ticks) void do_timer(unsigned long ticks) { jiffies_64 += ticks; - update_times(ticks); + update_wall_time(); + calc_global_load(); } #ifdef __ARCH_WANT_SYS_ALARM -- cgit v1.2.3 From 2d02494f5a90f2e4b3c4c6acc85ec94674cdc431 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 2 May 2009 20:08:52 +0200 Subject: sched, timers: cleanup avenrun users avenrun is an rough estimate so we don't have to worry about consistency of the three avenrun values. Remove the xtime lock dependency and provide a function to scale the values. Cleanup the users. [ Impact: cleanup ] Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra --- kernel/sched.c | 15 +++++++++++++++ kernel/timer.c | 32 ++++++-------------------------- 2 files changed, 21 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f4eb88153bd..497c09ba61e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2868,6 +2868,21 @@ static unsigned long calc_load_update; unsigned long avenrun[3]; EXPORT_SYMBOL(avenrun); +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} + static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active) { diff --git a/kernel/timer.c b/kernel/timer.c index 6a21d7af962..a26ed294f93 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1356,37 +1356,17 @@ int do_sysinfo(struct sysinfo *info) { unsigned long mem_total, sav_total; unsigned int mem_unit, bitcount; - unsigned long seq; + struct timespec tp; memset(info, 0, sizeof(struct sysinfo)); - do { - struct timespec tp; - seq = read_seqbegin(&xtime_lock); - - /* - * This is annoying. The below is the same thing - * posix_get_clock_monotonic() does, but it wants to - * take the lock which we want to cover the loads stuff - * too. - */ - - getnstimeofday(&tp); - tp.tv_sec += wall_to_monotonic.tv_sec; - tp.tv_nsec += wall_to_monotonic.tv_nsec; - monotonic_to_bootbased(&tp); - if (tp.tv_nsec - NSEC_PER_SEC >= 0) { - tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; - tp.tv_sec++; - } - info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); + ktime_get_ts(&tp); + monotonic_to_bootbased(&tp); + info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); - info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); - info->procs = nr_threads; - } while (read_seqretry(&xtime_lock, seq)); + info->procs = nr_threads; si_meminfo(info); si_swapinfo(info); -- cgit v1.2.3 From 2e569d36729c8105ae066a9b105068305442cc77 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:37:47 +0200 Subject: perf_counter: frequency based adaptive irq_period, 32-bit fix fix: kernel/built-in.o: In function `perf_counter_alloc': perf_counter.c:(.text+0x7ddc7): undefined reference to `__udivdi3' [ Impact: build fix on 32-bit systems ] Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner LKML-Reference: <1242394667.6642.1887.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0ad1db4f3d6..728a595399b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2942,7 +2942,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, hwc = &counter->hw; if (hw_event->freq && hw_event->irq_freq) - hwc->irq_period = TICK_NSEC / hw_event->irq_freq; + hwc->irq_period = div64_u64(TICK_NSEC, hw_event->irq_freq); else hwc->irq_period = hw_event->irq_period; -- cgit v1.2.3 From 9d23a90a67261e73b2fcac04d8ca963c6b496afb Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 14 May 2009 21:48:08 +1000 Subject: perf_counter: allow arch to supply event misc flags and instruction pointer At present the values we put in overflow events for the misc flags indicating processor mode and the instruction pointer are obtained using the standard user_mode() and instruction_pointer() functions. Those functions tell you where the performance monitor interrupt was taken, which might not be exactly where the counter overflow occurred, for example because interrupts were disabled at the point where the overflow occurred, or because the processor had many instructions in flight and chose to complete some more instructions beyond the one that caused the counter overflow. Some architectures (e.g. powerpc) can supply more precise information about where the counter overflow occurred and the processor mode at that point. This introduces new functions, perf_misc_flags() and perf_instruction_pointer(), which arch code can override to provide more precise information if available. They have default implementations which are identical to the existing code. This also adds a new misc flag value, PERF_EVENT_MISC_HYPERVISOR, for the case where a counter overflow occurred in the hypervisor. We encode the processor mode in the 2 bits previously used to indicate user or kernel mode; the values for user and kernel mode are unchanged and hypervisor mode is indicated by both bits being set. [ Impact: generalize perfcounter core facilities ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <18956.1272.818511.561835@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 728a595399b..57840a94b16 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2042,11 +2042,10 @@ static void perf_counter_output(struct perf_counter *counter, header.size = sizeof(header); header.misc = PERF_EVENT_MISC_OVERFLOW; - header.misc |= user_mode(regs) ? - PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL; + header.misc |= perf_misc_flags(regs); if (record_type & PERF_RECORD_IP) { - ip = instruction_pointer(regs); + ip = perf_instruction_pointer(regs); header.type |= PERF_RECORD_IP; header.size += sizeof(ip); } -- cgit v1.2.3 From 88fc86c283d9c3854e67e4155808027bc2519eb6 Mon Sep 17 00:00:00 2001 From: GeunSik Lim Date: Thu, 14 May 2009 17:23:38 +0900 Subject: tracing: Append prompt in /debug/tracing/README file append prompt in /debug/tracing/README file. This is trivial issue. Fix typo Mini Howto file(README) for ftrace. [ Impact: cleanup ] Signed-off-by: GeunSik Lim Acked-by: Steven Rostedt Cc: williams LKML-Reference: <1242289418.31161.45.camel@centos51> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a884c09006c..cda81ec58d9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2380,7 +2380,7 @@ static const char readme_msg[] = "# echo print-parent > /debug/tracing/trace_options\n" "# echo 1 > /debug/tracing/tracing_enabled\n" "# cat /debug/tracing/trace > /tmp/trace.txt\n" - "echo 0 > /debug/tracing/tracing_enabled\n" + "# echo 0 > /debug/tracing/tracing_enabled\n" ; static ssize_t -- cgit v1.2.3 From 4484079d517c2b6521621be0b1ea246ccc55c7d7 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 15 May 2009 23:30:50 +0200 Subject: PM: check sysdev_suspend(PMSG_FREEZE) return value Check the return value of sysdev_suspend(). I think this was a typo. Without this change, the following "if" check is always false. I also changed the error message so it's distinguishable from the similar message a few lines above. Signed-off-by: Bjorn Helgaas Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/disk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index e71ca9cd81b..b0dc9e7a0d1 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -241,9 +241,9 @@ static int create_image(int platform_mode) local_irq_disable(); - sysdev_suspend(PMSG_FREEZE); + error = sysdev_suspend(PMSG_FREEZE); if (error) { - printk(KERN_ERR "PM: Some devices failed to power down, " + printk(KERN_ERR "PM: Some system devices failed to power down, " "aborting hibernation\n"); goto Enable_irqs; } -- cgit v1.2.3 From 8bc2095951517e2c74b8aeeca4685ddd6b16ed4b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 20:45:59 +0200 Subject: perf_counter: Fix inheritance cleanup code Clean up code that open-coded the list_{add,del}_counter() code in __perf_counter_exit_task() which consequently diverged. This could lead to software counter crashes. Also, fold the ctx->nr_counter inc/dec into those functions and clean up some of the related code. [ Impact: fix potential sw counter crash, cleanup ] Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 57840a94b16..59a926d04ba 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -115,6 +115,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) } list_add_rcu(&counter->event_entry, &ctx->event_list); + ctx->nr_counters++; } static void @@ -122,6 +123,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { struct perf_counter *sibling, *tmp; + ctx->nr_counters--; + list_del_init(&counter->list_entry); list_del_rcu(&counter->event_entry); @@ -209,7 +212,6 @@ static void __perf_counter_remove_from_context(void *info) counter_sched_out(counter, cpuctx, ctx); counter->task = NULL; - ctx->nr_counters--; /* * Protect the list operation against NMI by disabling the @@ -276,7 +278,6 @@ retry: * succeed. */ if (!list_empty(&counter->list_entry)) { - ctx->nr_counters--; list_del_counter(counter, ctx); counter->task = NULL; } @@ -544,7 +545,6 @@ static void add_counter_to_ctx(struct perf_counter *counter, struct perf_counter_context *ctx) { list_add_counter(counter, ctx); - ctx->nr_counters++; counter->prev_state = PERF_COUNTER_STATE_OFF; counter->tstamp_enabled = ctx->time; counter->tstamp_running = ctx->time; @@ -3206,9 +3206,8 @@ static int inherit_group(struct perf_counter *parent_counter, static void sync_child_counter(struct perf_counter *child_counter, struct perf_counter *parent_counter) { - u64 parent_val, child_val; + u64 child_val; - parent_val = atomic64_read(&parent_counter->count); child_val = atomic64_read(&child_counter->count); /* @@ -3240,7 +3239,6 @@ __perf_counter_exit_task(struct task_struct *child, struct perf_counter_context *child_ctx) { struct perf_counter *parent_counter; - struct perf_counter *sub, *tmp; /* * If we do not self-reap then we have to wait for the @@ -3252,8 +3250,8 @@ __perf_counter_exit_task(struct task_struct *child, */ if (child != current) { wait_task_inactive(child, 0); - list_del_init(&child_counter->list_entry); update_counter_times(child_counter); + list_del_counter(child_counter, child_ctx); } else { struct perf_cpu_context *cpuctx; unsigned long flags; @@ -3272,9 +3270,7 @@ __perf_counter_exit_task(struct task_struct *child, group_sched_out(child_counter, cpuctx, child_ctx); update_counter_times(child_counter); - list_del_init(&child_counter->list_entry); - - child_ctx->nr_counters--; + list_del_counter(child_counter, child_ctx); perf_enable(); local_irq_restore(flags); @@ -3288,13 +3284,6 @@ __perf_counter_exit_task(struct task_struct *child, */ if (parent_counter) { sync_child_counter(child_counter, parent_counter); - list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list, - list_entry) { - if (sub->parent) { - sync_child_counter(sub, sub->parent); - free_counter(sub); - } - } free_counter(child_counter); } } @@ -3315,9 +3304,18 @@ void perf_counter_exit_task(struct task_struct *child) if (likely(!child_ctx->nr_counters)) return; +again: list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, list_entry) __perf_counter_exit_task(child, child_counter, child_ctx); + + /* + * If the last counter was a group counter, it will have appended all + * its siblings to the list, but we obtained 'tmp' before that which + * will still point to the list head terminating the iteration. + */ + if (!list_empty(&child_ctx->counter_list)) + goto again; } /* -- cgit v1.2.3 From 856d56b9e5de650a64a6c41c17aaed702b55d578 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 20:45:59 +0200 Subject: perf_counter: Fix counter inheritance Srivatsa Vaddagiri reported that a Java workload triggers this warning in kernel/exit.c: WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); Add the inherited counter propagation on self-detach, this could cause counter leaks and incomplete stats in threaded code like the below: #include #include void *thread(void *arg) { sleep(5); return NULL; } void main(void) { pthread_t thr; pthread_create(&thr, NULL, thread, NULL); } Reported-by: Srivatsa Vaddagiri Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar --- kernel/exit.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 4741376c8de..16d74f13a3e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -128,6 +128,12 @@ static void __exit_signal(struct task_struct *tsk) sig = NULL; /* Marker for below. */ } + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(tsk); + __unhash_process(tsk); /* -- cgit v1.2.3 From 0203026b58b4299ba7281c0b4b417207c1f05d0e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 11:24:08 +0200 Subject: perf_counter: fix threaded task exit Flushing counters in __exit_signal() with irqs disabled is not a good idea as perf_counter_exit_task() acquires mutexes. So flush it before acquiring the tasklist lock. (Note, we still need a fix for when the PID has been unhashed.) [ Impact: fix crash with inherited counters ] Cc: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar --- kernel/exit.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 16d74f13a3e..73affd35e76 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -128,12 +128,6 @@ static void __exit_signal(struct task_struct *tsk) sig = NULL; /* Marker for below. */ } - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - */ - perf_counter_exit_task(tsk); - __unhash_process(tsk); /* @@ -183,6 +177,13 @@ repeat: atomic_dec(&__task_cred(p)->user->processes); proc_flush_task(p); + + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(p); + write_lock_irq(&tasklist_lock); tracehook_finish_release_task(p); __exit_signal(p); -- cgit v1.2.3 From 24ed0c4bfc7d2d7507bb9d50f7f3bbdcd85d76dd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 17 May 2009 15:31:38 +0800 Subject: tracing: fix check for return value of register_module_notifier return zero should be correct, so fix it. [ Impact: eliminate incorrect syslog message ] Signed-off-by: Ming Lei Acked-by: Frederic Weisbecker Acked-by: Li Zefan Cc: rostedt@goodmis.org LKML-Reference: <1242545498-7285-1-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5b606f45b6c..140699a9a8a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2826,7 +2826,7 @@ void __init ftrace_init(void) __stop_mcount_loc); ret = register_module_notifier(&ftrace_module_nb); - if (!ret) + if (ret) pr_warning("Failed to register trace ftrace module notifier\n"); return; -- cgit v1.2.3 From 4200efd9acda4accf24640f1e77d24fdcdb524df Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 19 May 2009 09:22:19 +0200 Subject: sched: properly define the sched_group::cpumask and sched_domain::span fields Properly document the variable-size structure tricks we are doing wrt. struct sched_group and sched_domain, and use the field[0] GCC extension instead of defining a vla array. Dont use unions for this, as pointed out by Linus. [ Impact: cleanup, un-confuse Sparse and LLVM ] Reported-by: Jeff Garzik Acked-by: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 497c09ba61e..228acae8821 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7948,8 +7948,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; /* * The cpus mask in sched_group and sched_domain hangs off the end. - * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space - * for nr_cpu_ids < CONFIG_NR_CPUS. + * + * ( See the the comments in include/linux/sched.h:struct sched_group + * and struct sched_domain. ) */ struct static_sched_group { struct sched_group sg; -- cgit v1.2.3 From fd51d251e4cdb21f68e9dbc4336514d64a105a79 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Tue, 19 May 2009 09:59:08 +0200 Subject: blktrace: remove debugfs entries on bad path debugfs directory entries for devices are not removed on some of the failure pathes in do_blk_trace_setup(). One way to reproduce is to start blktrace on multiple devices with insufficient Vmalloc space: Devices will fail with a message like this: BLKTRACESETUP(2) /dev/sdu failed: 5/Input/output error If so, the respective entries in debugfs (e.g. /sys/kernel/debug/block/sdu) will remain and subsequent attempts to start blktrace on the respective devices will not succeed due to existing directories. [ Impact: fix /debug/tracing file cleanup corner case ] Signed-off-by: Stefan Raspl Acked-by: Li Zefan Cc: Li Zefan Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com LKML-Reference: <4A1266CC.5040801@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 05b4747fd87..e3abf55bc8e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -262,6 +262,7 @@ static void blk_trace_free(struct blk_trace *bt) { debugfs_remove(bt->msg_file); debugfs_remove(bt->dropped_file); + debugfs_remove(bt->dir); relay_close(bt->rchan); free_percpu(bt->sequence); free_percpu(bt->msg_data); -- cgit v1.2.3 From 64d1304a64477629cb16b75491a77bafe6f86963 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 May 2009 21:20:10 +0200 Subject: futex: setup writeable mapping for futex ops which modify user space data The futex code installs a read only mapping via get_user_pages_fast() even if the futex op function has to modify user space data. The eventual fault was fixed up by futex_handle_fault() which walked the VMA with mmap_sem held. After the cleanup patches which removed the mmap_sem dependency of the futex code commit 4dc5b7a36a49eff97050894cf1b3a9a02523717 (futex: clean up fault logic) removed the private VMA walk logic from the futex code. This change results in a stale RO mapping which is not fixed up. Instead of reintroducing the previous fault logic we set up the mapping in get_user_pages_fast() read/write for all operations which modify user space data. Also handle private futexes in the same way and make the current unconditional access_ok(VERIFY_WRITE) depend on the futex op. Reported-by: Andreas Schwab Signed-off-by: Thomas Gleixner CC: stable@kernel.org --- kernel/futex.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index eef8cd26b5e..d546b2d53a6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -193,6 +193,7 @@ static void drop_futex_key_refs(union futex_key *key) * @uaddr: virtual address of the futex * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED * @key: address where result is stored. + * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE) * * Returns a negative error code or 0 * The key words are stored in *key on success. @@ -203,7 +204,8 @@ static void drop_futex_key_refs(union futex_key *key) * * lock_page() might sleep, the caller should not hold a spinlock. */ -static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) +static int +get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -226,7 +228,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) * but access_ok() should be faster than find_vma() */ if (!fshared) { - if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) + if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) return -EFAULT; key->private.mm = mm; key->private.address = address; @@ -235,7 +237,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) } again: - err = get_user_pages_fast(address, 1, 0, &page); + err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page); if (err < 0) return err; @@ -677,7 +679,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, fshared, &key); + ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ); if (unlikely(ret != 0)) goto out; @@ -723,10 +725,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int ret, op_ret; retry: - ret = get_futex_key(uaddr1, fshared, &key1); + ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) goto out_put_key1; @@ -814,10 +816,10 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int ret, drop_count = 0; retry: - ret = get_futex_key(uaddr1, fshared, &key1); + ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_READ); if (unlikely(ret != 0)) goto out_put_key1; @@ -1140,7 +1142,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, q.bitset = bitset; retry: q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key); + ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_READ); if (unlikely(ret != 0)) goto out; @@ -1330,7 +1332,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, q.pi_state = NULL; retry: q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key); + ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); if (unlikely(ret != 0)) goto out; @@ -1594,7 +1596,7 @@ retry: if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) return -EPERM; - ret = get_futex_key(uaddr, fshared, &key); + ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE); if (unlikely(ret != 0)) goto out; -- cgit v1.2.3 From 33b2fb303fe7f6b08bbb32f708e67b96eaa94a7a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 11:08:41 +0200 Subject: perf_counter: fix counter freeing logic Fix counter lifetime bugs which explain the crashes reported by Marcelo Tosatti and Arnaldo Carvalho de Melo. The new rule is: flushing + freeing is only done for a task's own counters, never for other tasks. [ Impact: fix crashes/lockups with inherited counters ] Reported-by: Arnaldo Carvalho de Melo Reported-by: Marcelo Tosatti Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Signed-off-by: Ingo Molnar --- kernel/exit.c | 19 +++++++------------ kernel/perf_counter.c | 2 ++ 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 73affd35e76..f9dfedd94af 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -178,12 +178,6 @@ repeat: proc_flush_task(p); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - */ - perf_counter_exit_task(p); - write_lock_irq(&tasklist_lock); tracehook_finish_release_task(p); __exit_signal(p); @@ -985,6 +979,13 @@ NORET_TYPE void do_exit(long code) module_put(tsk->binfmt->module); proc_exit_connector(tsk); + + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(tsk); + exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA mpol_put(tsk->mempolicy); @@ -1257,12 +1258,6 @@ static int wait_task_zombie(struct task_struct *p, int options, */ read_unlock(&tasklist_lock); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - */ - perf_counter_exit_task(p); - retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 59a926d04ba..7af16d1c480 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3299,6 +3299,8 @@ void perf_counter_exit_task(struct task_struct *child) struct perf_counter *child_counter, *tmp; struct perf_counter_context *child_ctx; + WARN_ON_ONCE(child != current); + child_ctx = &child->perf_counter_ctx; if (likely(!child_ctx->nr_counters)) -- cgit v1.2.3 From c44d70a340554a33071339064a303ac0f1a31623 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 11:24:08 +0200 Subject: perf_counter: fix counter inheritance race Context rotation should not occur when we are in the middle of walking the counter list when inheriting counters ... [ Impact: fix occasionally incorrect perf stat results ] Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7af16d1c480..4d8f97375f3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1120,7 +1120,8 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) __perf_counter_task_sched_out(ctx); rotate_ctx(&cpuctx->ctx); - rotate_ctx(ctx); + if (ctx->rr_allowed) + rotate_ctx(ctx); perf_counter_cpu_sched_in(cpuctx, cpu); perf_counter_task_sched_in(curr, cpu); @@ -3108,6 +3109,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx, mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->counter_list); INIT_LIST_HEAD(&ctx->event_list); + ctx->rr_allowed = 1; ctx->task = task; } @@ -3348,6 +3350,9 @@ void perf_counter_init_task(struct task_struct *child) */ mutex_lock(&parent_ctx->mutex); + parent_ctx->rr_allowed = 0; + barrier(); /* irqs */ + /* * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: @@ -3361,6 +3366,9 @@ void perf_counter_init_task(struct task_struct *child) break; } + barrier(); /* irqs */ + parent_ctx->rr_allowed = 1; + mutex_unlock(&parent_ctx->mutex); } -- cgit v1.2.3 From c8b15a706d921baed3195407e4f55270112bb3c6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 May 2009 09:18:50 +0200 Subject: futex: cleanup error exit Reuse the put_key_ref(key2) call in the exit path. Signed-off-by: Thomas Gleixner --- kernel/futex.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 476603afd14..381125a9f1e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2185,10 +2185,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, /* Prepare to wait on uaddr. */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); - if (ret) { - put_futex_key(fshared, &key2); - goto out; - } + if (ret) + goto out_key2; /* Queue the futex_q, drop the hb lock, wait for wakeup. */ futex_wait_queue_me(hb, &q, to); @@ -2282,6 +2280,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, out_put_keys: put_futex_key(fshared, &q.key); +out_key2: put_futex_key(fshared, &key2); out: -- cgit v1.2.3 From 1c840c14906d4ddf66c1f4f5daea059aad951c82 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 May 2009 09:22:40 +0200 Subject: futex: fix restart for early wakeup in futex_wait_requeue_pi() The futex_wait_requeue_pi op should restart unconditionally like futex_lock_pi. The user of that function e.g. pthread_cond_wait can not be interrupted so we do not care about the SA_RESTART flag of the signal. Clean up the FIXMEs. Signed-off-by: Thomas Gleixner --- kernel/futex.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 381125a9f1e..2aa216e5b59 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2060,7 +2060,7 @@ pi_faulted: * * Returns * 0 - no early wakeup detected - * <0 - -ETIMEDOUT or -ERESTARTSYS (FIXME: or ERESTARTNOINTR?) + * <0 - -ETIMEDOUT or -ERESTARTNOINTR */ static inline int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, @@ -2087,15 +2087,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, if (timeout && !timeout->task) ret = -ETIMEDOUT; - else { - /* - * We expect signal_pending(current), but another - * thread may have handled it for us already. - */ - /* FIXME: ERESTARTSYS or ERESTARTNOINTR? Do we care if - * the user specified SA_RESTART or not? */ - ret = -ERESTARTSYS; - } + else + ret = -ERESTARTNOINTR; } return ret; } -- cgit v1.2.3 From 2070887fdeacd9c13f3e805e3f0086c9f22a4d93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 May 2009 23:04:59 +0200 Subject: futex: fix restart in wait_requeue_pi If the waiter has been requeued to the outer PI futex and is interrupted by a signal and the thread handles the signal then ERESTART_RESTARTBLOCK is changed to EINTR and the restart block is discarded. That way we return an unexcpected EINTR to user space instead of ending up in futex_lock_pi_restart. But we do not need to restart the syscall because we know that the condition has changed since we have been requeued. If we would simply restart the syscall then we would drop out via the comparison of the user space value with EWOULDBLOCK. The user space side needs to handle EWOULDBLOCK anyway as the enqueueing on the inner futex can race with a requeue/wake. So we can simply return EWOULDBLOCK to user space which also signals that we did not take the outer futex and let user space handle it in the same way it has to handle the requeue/wake race. Signed-off-by: Thomas Gleixner --- kernel/futex.c | 49 +++++++++---------------------------------------- 1 file changed, 9 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 2aa216e5b59..80b5ce71659 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1507,7 +1507,6 @@ handle_fault: #define FLAGS_HAS_TIMEOUT 0x04 static long futex_wait_restart(struct restart_block *restart); -static long futex_lock_pi_restart(struct restart_block *restart); /** * fixup_owner() - Post lock pi_state and corner case management @@ -1930,21 +1929,6 @@ uaddr_faulted: goto retry; } -static long futex_lock_pi_restart(struct restart_block *restart) -{ - u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; - ktime_t t, *tp = NULL; - int fshared = restart->futex.flags & FLAGS_SHARED; - - if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { - t.tv64 = restart->futex.time; - tp = &t; - } - restart->fn = do_no_restart_syscall; - - return (long)futex_lock_pi(uaddr, fshared, restart->futex.val, tp, 0); -} - /* * Userspace attempted a TID -> 0 atomic transition, and failed. * This is the in-kernel slowpath: we look up the PI state (if any), @@ -2141,12 +2125,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, struct hrtimer_sleeper timeout, *to = NULL; struct rt_mutex_waiter rt_waiter; struct rt_mutex *pi_mutex = NULL; - struct restart_block *restart; struct futex_hash_bucket *hb; union futex_key key2; struct futex_q q; int res, ret; - u32 uval; if (!bitset) return -EINVAL; @@ -2245,30 +2227,17 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, if (rt_mutex_owner(pi_mutex) == current) rt_mutex_unlock(pi_mutex); } else if (ret == -EINTR) { - ret = -EFAULT; - if (get_user(uval, uaddr2)) - goto out_put_keys; - /* - * We've already been requeued, so restart by calling - * futex_lock_pi() directly, rather then returning to this - * function. + * We've already been requeued, but we have no way to + * restart by calling futex_lock_pi() directly. We + * could restart the syscall, but that will look at + * the user space value and return right away. So we + * drop back with EWOULDBLOCK to tell user space that + * "val" has been changed. That's the same what the + * restart of the syscall would do in + * futex_wait_setup(). */ - ret = -ERESTART_RESTARTBLOCK; - restart = ¤t_thread_info()->restart_block; - restart->fn = futex_lock_pi_restart; - restart->futex.uaddr = (u32 *)uaddr2; - restart->futex.val = uval; - restart->futex.flags = 0; - if (abs_time) { - restart->futex.flags |= FLAGS_HAS_TIMEOUT; - restart->futex.time = abs_time->tv64; - } - - if (fshared) - restart->futex.flags |= FLAGS_SHARED; - if (clockrt) - restart->futex.flags |= FLAGS_CLOCKRT; + ret = -EWOULDBLOCK; } out_put_keys: -- cgit v1.2.3 From d7b629a34fc4134a43c730b5f0197855dc4948d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 May 2009 12:21:19 +0200 Subject: perf_counter: Solve the rotate_ctx vs inherit race differently Instead of disabling RR scheduling of the counters, use a different list that does not get rotated to iterate the counters on inheritance. [ Impact: cleanup, optimization ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: Marcelo Tosatti Cc: John Kacur LKML-Reference: <20090520102553.237504544@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4d8f97375f3..64113e6d194 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1120,8 +1120,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) __perf_counter_task_sched_out(ctx); rotate_ctx(&cpuctx->ctx); - if (ctx->rr_allowed) - rotate_ctx(ctx); + rotate_ctx(ctx); perf_counter_cpu_sched_in(cpuctx, cpu); perf_counter_task_sched_in(curr, cpu); @@ -3109,7 +3108,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx, mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->counter_list); INIT_LIST_HEAD(&ctx->event_list); - ctx->rr_allowed = 1; ctx->task = task; } @@ -3350,14 +3348,14 @@ void perf_counter_init_task(struct task_struct *child) */ mutex_lock(&parent_ctx->mutex); - parent_ctx->rr_allowed = 0; - barrier(); /* irqs */ - /* * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) { + list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) { + if (counter != counter->group_leader) + continue; + if (!counter->hw_event.inherit) continue; @@ -3366,9 +3364,6 @@ void perf_counter_init_task(struct task_struct *child) break; } - barrier(); /* irqs */ - parent_ctx->rr_allowed = 1; - mutex_unlock(&parent_ctx->mutex); } -- cgit v1.2.3 From 26b119bc811a73bac6ecf95bdf284bf31c7955f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 May 2009 12:21:20 +0200 Subject: perf_counter: Log irq_period changes For the dynamic irq_period code, log whenever we change the period so that analyzing code can normalize the event flow. [ Impact: add new feature to allow more precise profiling ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: Marcelo Tosatti Cc: John Kacur LKML-Reference: <20090520102553.298769743@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 64113e6d194..db02eb16c77 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1046,7 +1046,9 @@ int perf_counter_task_enable(void) return 0; } -void perf_adjust_freq(struct perf_counter_context *ctx) +static void perf_log_period(struct perf_counter *counter, u64 period); + +static void perf_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; u64 irq_period; @@ -1072,6 +1074,8 @@ void perf_adjust_freq(struct perf_counter_context *ctx) if (!irq_period) irq_period = 1; + perf_log_period(counter, irq_period); + counter->hw.irq_period = irq_period; counter->hw.interrupts = 0; } @@ -2406,6 +2410,40 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, perf_counter_mmap_event(&mmap_event); } +/* + * + */ + +static void perf_log_period(struct perf_counter *counter, u64 period) +{ + struct perf_output_handle handle; + int ret; + + struct { + struct perf_event_header header; + u64 time; + u64 period; + } freq_event = { + .header = { + .type = PERF_EVENT_PERIOD, + .misc = 0, + .size = sizeof(freq_event), + }, + .time = sched_clock(), + .period = period, + }; + + if (counter->hw.irq_period == period) + return; + + ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0); + if (ret) + return; + + perf_output_put(&handle, freq_event); + perf_output_end(&handle); +} + /* * Generic counter overflow handling. */ -- cgit v1.2.3 From b986d7ec0f8b7ea3cc7366d80a137fbe839df227 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 May 2009 12:21:21 +0200 Subject: perf_counter: Optimize disable of time based sw counters Currently we call hrtimer_cancel() unconditionally on disable of time based software counters. Avoid when possible. [ Impact: micro-optimize the code ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: Marcelo Tosatti Cc: John Kacur LKML-Reference: <20090520102553.388185031@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index db02eb16c77..473ed2cafbf 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2716,7 +2716,8 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) static void cpu_clock_perf_counter_disable(struct perf_counter *counter) { - hrtimer_cancel(&counter->hw.hrtimer); + if (counter->hw.irq_period) + hrtimer_cancel(&counter->hw.hrtimer); cpu_clock_perf_counter_update(counter); } @@ -2767,7 +2768,8 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) static void task_clock_perf_counter_disable(struct perf_counter *counter) { - hrtimer_cancel(&counter->hw.hrtimer); + if (counter->hw.irq_period) + hrtimer_cancel(&counter->hw.hrtimer); task_clock_perf_counter_update(counter, counter->ctx->time); } -- cgit v1.2.3 From afedadf23a2c90f3ba0d963282cbe6a6be129494 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 May 2009 12:21:22 +0200 Subject: perf_counter: Optimize sched in/out of counters Avoid a function call for !group counters by directly calling the counter function. [ Impact: micro-optimize the code ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Cc: John Kacur LKML-Reference: <20090520102553.511933670@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 473ed2cafbf..69d4de81596 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -826,8 +826,12 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, perf_disable(); if (ctx->nr_active) { - list_for_each_entry(counter, &ctx->counter_list, list_entry) - group_sched_out(counter, cpuctx, ctx); + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter != counter->group_leader) + counter_sched_out(counter, cpuctx, ctx); + else + group_sched_out(counter, cpuctx, ctx); + } } perf_enable(); out: @@ -903,8 +907,12 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, if (counter->cpu != -1 && counter->cpu != cpu) continue; - if (group_can_go_on(counter, cpuctx, 1)) - group_sched_in(counter, cpuctx, ctx, cpu); + if (counter != counter->group_leader) + counter_sched_in(counter, cpuctx, ctx, cpu); + else { + if (group_can_go_on(counter, cpuctx, 1)) + group_sched_in(counter, cpuctx, ctx, cpu); + } /* * If this pinned group hasn't been scheduled, @@ -932,9 +940,14 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, if (counter->cpu != -1 && counter->cpu != cpu) continue; - if (group_can_go_on(counter, cpuctx, can_add_hw)) { - if (group_sched_in(counter, cpuctx, ctx, cpu)) + if (counter != counter->group_leader) { + if (counter_sched_in(counter, cpuctx, ctx, cpu)) can_add_hw = 0; + } else { + if (group_can_go_on(counter, cpuctx, can_add_hw)) { + if (group_sched_in(counter, cpuctx, ctx, cpu)) + can_add_hw = 0; + } } } perf_enable(); -- cgit v1.2.3 From 5537937696c55530447c20aa27daccb8d0d29b33 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 May 2009 23:04:46 +0800 Subject: ftrace: fix check for return value of register_module_notifier in event_trace_init register_module_notifier() returns zero in the success case. So fix the inverted fail case check in trace events modules handler. [ Impact: fix spurious warning on ftrace initialization] Reported-by: Li Zefan Signed-off-by: Ming Lei Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0eec0c55dd8..9e91c4ad7c8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1174,7 +1174,7 @@ static __init int event_trace_init(void) } ret = register_module_notifier(&trace_module_nb); - if (!ret) + if (ret) pr_warning("Failed to register trace events module notifier\n"); return 0; -- cgit v1.2.3 From 34adc8062227f41b04ade0ff3fbd1dbe3002669e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 May 2009 20:13:28 +0200 Subject: perf_counter: Fix context removal deadlock Disable the PMU globally before removing a counter from a context. This fixes the following lockup: [22081.741922] ------------[ cut here ]------------ [22081.746668] WARNING: at arch/x86/kernel/cpu/perf_counter.c:803 intel_pmu_handle_irq+0x9b/0x24e() [22081.755624] Hardware name: X8DTN [22081.758903] perfcounters: irq loop stuck! [22081.762985] Modules linked in: [22081.766136] Pid: 11082, comm: perf Not tainted 2.6.30-rc6-tip #226 [22081.772432] Call Trace: [22081.774940] [] ? intel_pmu_handle_irq+0x9b/0x24e [22081.781993] [] ? intel_pmu_handle_irq+0x9b/0x24e [22081.788368] [] ? warn_slowpath_common+0x77/0xa3 [22081.794649] [] ? warn_slowpath_fmt+0x40/0x45 [22081.800696] [] ? intel_pmu_handle_irq+0x9b/0x24e [22081.807080] [] ? perf_counter_nmi_handler+0x3f/0x4a [22081.813751] [] ? notifier_call_chain+0x58/0x86 [22081.819951] [] ? notify_die+0x2d/0x32 [22081.825392] [] ? do_nmi+0x8e/0x242 [22081.830538] [] ? nmi+0x1a/0x20 [22081.835342] [] ? selinux_file_free_security+0x0/0x1a [22081.842105] [] ? x86_pmu_disable_counter+0x15/0x41 [22081.848673] <> [] ? x86_pmu_disable+0x86/0x103 [22081.855512] [] ? __perf_counter_remove_from_context+0x0/0xfe [22081.862926] [] ? counter_sched_out+0x30/0xce [22081.868909] [] ? __perf_counter_remove_from_context+0x59/0xfe [22081.876382] [] ? smp_call_function_single+0x6c/0xe6 [22081.882955] [] ? perf_release+0x86/0x14c [22081.888600] [] ? __fput+0xe7/0x195 [22081.893718] [] ? filp_close+0x5b/0x62 [22081.899107] [] ? put_files_struct+0x64/0xc2 [22081.905031] [] ? do_exit+0x1e2/0x6ef [22081.910360] [] ? _spin_lock_irqsave+0x9/0xe [22081.916292] [] ? do_group_exit+0x67/0x93 [22081.921953] [] ? sys_exit_group+0x12/0x16 [22081.927759] [] ? system_call_fastpath+0x16/0x1b [22081.934076] ---[ end trace 3a3936ce3e1b4505 ]--- And could potentially also fix the lockup reported by Marcelo Tosatti. Also, print more debug info in case of a detected lockup. [ Impact: fix lockup ] Reported-by: Marcelo Tosatti Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Thomas Gleixner LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 69d4de81596..08584c16049 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -208,18 +208,17 @@ static void __perf_counter_remove_from_context(void *info) return; spin_lock_irqsave(&ctx->lock, flags); + /* + * Protect the list operation against NMI by disabling the + * counters on a global level. + */ + perf_disable(); counter_sched_out(counter, cpuctx, ctx); counter->task = NULL; - /* - * Protect the list operation against NMI by disabling the - * counters on a global level. NOP for non NMI based counters. - */ - perf_disable(); list_del_counter(counter, ctx); - perf_enable(); if (!ctx->task) { /* @@ -231,6 +230,7 @@ static void __perf_counter_remove_from_context(void *info) perf_max_counters - perf_reserved_percpu); } + perf_enable(); spin_unlock_irqrestore(&ctx->lock, flags); } -- cgit v1.2.3 From a63eaf34ae60bdb067a354cc8def2e8f4a01f5f4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 May 2009 14:17:31 +1000 Subject: perf_counter: Dynamically allocate tasks' perf_counter_context struct This replaces the struct perf_counter_context in the task_struct with a pointer to a dynamically allocated perf_counter_context struct. The main reason for doing is this is to allow us to transfer a perf_counter_context from one task to another when we do lazy PMU switching in a later patch. This has a few side-benefits: the task_struct becomes a little smaller, we save some memory because only tasks that have perf_counters attached get a perf_counter_context allocated for them, and we can remove the inclusion of in sched.h, meaning that we don't end up recompiling nearly everything whenever perf_counter.h changes. The perf_counter_context structures are reference-counted and freed when the last reference is dropped. A context can have references from its task and the counters on its task. Counters can outlive the task so it is possible that a context will be freed well after its task has exited. Contexts are allocated on fork if the parent had a context, or otherwise the first time that a per-task counter is created on a task. In the latter case, we set the context pointer in the task struct locklessly using an atomic compare-and-exchange operation in case we raced with some other task in creating a context for the subject task. This also removes the task pointer from the perf_counter struct. The task pointer was not used anywhere and would make it harder to move a context from one task to another. Anything that needed to know which task a counter was attached to was already using counter->ctx->task. The __perf_counter_init_context function moves up in perf_counter.c so that it can be called from find_get_context, and now initializes the refcount, but is otherwise unchanged. We were potentially calling list_del_counter twice: once from __perf_counter_exit_task when the task exits and once from __perf_counter_remove_from_context when the counter's fd gets closed. This adds a check in list_del_counter so it doesn't do anything if the counter has already been removed from the lists. Since perf_counter_task_sched_in doesn't do anything if the task doesn't have a context, and leaves cpuctx->task_ctx = NULL, this adds code to __perf_install_in_context to set cpuctx->task_ctx if necessary, i.e. in the case where the current task adds the first counter to itself and thus creates a context for itself. This also adds similar code to __perf_counter_enable to handle a similar situation which can arise when the counters have been disabled using prctl; that also leaves cpuctx->task_ctx = NULL. [ Impact: refactor counter context management to prepare for new feature ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10075.781053.231153@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/exit.c | 3 +- kernel/fork.c | 1 + kernel/perf_counter.c | 218 +++++++++++++++++++++++++++++++------------------- 3 files changed, 139 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f9dfedd94af..99ad4063ee4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -159,7 +160,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); #ifdef CONFIG_PERF_COUNTERS - WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); + WARN_ON_ONCE(tsk->perf_counter_ctxp); #endif trace_sched_process_free(tsk); put_task_struct(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index d32fef4d38e..e72a09f5355 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 08584c16049..06ea3eae886 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -97,6 +97,17 @@ void perf_enable(void) hw_perf_enable(); } +static void get_ctx(struct perf_counter_context *ctx) +{ + atomic_inc(&ctx->refcount); +} + +static void put_ctx(struct perf_counter_context *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) + kfree(ctx); +} + static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -118,11 +129,17 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) ctx->nr_counters++; } +/* + * Remove a counter from the lists for its context. + * Must be called with counter->mutex and ctx->mutex held. + */ static void list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { struct perf_counter *sibling, *tmp; + if (list_empty(&counter->list_entry)) + return; ctx->nr_counters--; list_del_init(&counter->list_entry); @@ -216,8 +233,6 @@ static void __perf_counter_remove_from_context(void *info) counter_sched_out(counter, cpuctx, ctx); - counter->task = NULL; - list_del_counter(counter, ctx); if (!ctx->task) { @@ -279,7 +294,6 @@ retry: */ if (!list_empty(&counter->list_entry)) { list_del_counter(counter, ctx); - counter->task = NULL; } spin_unlock_irq(&ctx->lock); } @@ -568,11 +582,17 @@ static void __perf_install_in_context(void *info) * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. + * Or possibly this is the right context but it isn't + * on this cpu because it had no counters. */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } spin_lock_irqsave(&ctx->lock, flags); + ctx->is_active = 1; update_context_time(ctx); /* @@ -653,7 +673,6 @@ perf_install_in_context(struct perf_counter_context *ctx, return; } - counter->task = task; retry: task_oncpu_function_call(task, __perf_install_in_context, counter); @@ -693,10 +712,14 @@ static void __perf_counter_enable(void *info) * If this is a per-task counter, need to check whether this * counter's task is the current task on this cpu. */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } spin_lock_irqsave(&ctx->lock, flags); + ctx->is_active = 1; update_context_time(ctx); counter->prev_state = counter->state; @@ -852,10 +875,10 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, void perf_counter_task_sched_out(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter_context *ctx = task->perf_counter_ctxp; struct pt_regs *regs; - if (likely(!cpuctx->task_ctx)) + if (likely(!ctx || !cpuctx->task_ctx)) return; update_context_time(ctx); @@ -871,6 +894,8 @@ static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + if (!cpuctx->task_ctx) + return; __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; } @@ -969,8 +994,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, void perf_counter_task_sched_in(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter_context *ctx = task->perf_counter_ctxp; + if (likely(!ctx)) + return; __perf_counter_sched_in(ctx, cpuctx, cpu); cpuctx->task_ctx = ctx; } @@ -985,11 +1012,11 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) int perf_counter_task_disable(void) { struct task_struct *curr = current; - struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; unsigned long flags; - if (likely(!ctx->nr_counters)) + if (!ctx || !ctx->nr_counters) return 0; local_irq_save(flags); @@ -1020,12 +1047,12 @@ int perf_counter_task_disable(void) int perf_counter_task_enable(void) { struct task_struct *curr = current; - struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; unsigned long flags; int cpu; - if (likely(!ctx->nr_counters)) + if (!ctx || !ctx->nr_counters) return 0; local_irq_save(flags); @@ -1128,19 +1155,23 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) return; cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = &curr->perf_counter_ctx; + ctx = curr->perf_counter_ctxp; perf_adjust_freq(&cpuctx->ctx); - perf_adjust_freq(ctx); + if (ctx) + perf_adjust_freq(ctx); perf_counter_cpu_sched_out(cpuctx); - __perf_counter_task_sched_out(ctx); + if (ctx) + __perf_counter_task_sched_out(ctx); rotate_ctx(&cpuctx->ctx); - rotate_ctx(ctx); + if (ctx) + rotate_ctx(ctx); perf_counter_cpu_sched_in(cpuctx, cpu); - perf_counter_task_sched_in(curr, cpu); + if (ctx) + perf_counter_task_sched_in(curr, cpu); } /* @@ -1176,6 +1207,22 @@ static u64 perf_counter_read(struct perf_counter *counter) return atomic64_read(&counter->count); } +/* + * Initialize the perf_counter context in a task_struct: + */ +static void +__perf_counter_init_context(struct perf_counter_context *ctx, + struct task_struct *task) +{ + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->counter_list); + INIT_LIST_HEAD(&ctx->event_list); + atomic_set(&ctx->refcount, 1); + ctx->task = task; +} + static void put_context(struct perf_counter_context *ctx) { if (ctx->task) @@ -1186,6 +1233,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; + struct perf_counter_context *tctx; struct task_struct *task; /* @@ -1225,15 +1273,36 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) if (!task) return ERR_PTR(-ESRCH); - ctx = &task->perf_counter_ctx; - ctx->task = task; - /* Reuse ptrace permission checks for now. */ if (!ptrace_may_access(task, PTRACE_MODE_READ)) { - put_context(ctx); + put_task_struct(task); return ERR_PTR(-EACCES); } + ctx = task->perf_counter_ctxp; + if (!ctx) { + ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + if (!ctx) { + put_task_struct(task); + return ERR_PTR(-ENOMEM); + } + __perf_counter_init_context(ctx, task); + /* + * Make sure other cpus see correct values for *ctx + * once task->perf_counter_ctxp is visible to them. + */ + smp_wmb(); + tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx); + if (tctx) { + /* + * We raced with some other task; use + * the context they set. + */ + kfree(ctx); + ctx = tctx; + } + } + return ctx; } @@ -1242,6 +1311,7 @@ static void free_counter_rcu(struct rcu_head *head) struct perf_counter *counter; counter = container_of(head, struct perf_counter, rcu_head); + put_ctx(counter->ctx); kfree(counter); } @@ -2247,7 +2317,7 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) perf_counter_comm_ctx(&cpuctx->ctx, comm_event); put_cpu_var(perf_cpu_context); - perf_counter_comm_ctx(¤t->perf_counter_ctx, comm_event); + perf_counter_comm_ctx(current->perf_counter_ctxp, comm_event); } void perf_counter_comm(struct task_struct *task) @@ -2256,7 +2326,9 @@ void perf_counter_comm(struct task_struct *task) if (!atomic_read(&nr_comm_tracking)) return; - + if (!current->perf_counter_ctxp) + return; + comm_event = (struct perf_comm_event){ .task = task, .event = { @@ -2372,7 +2444,7 @@ got_name: perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); put_cpu_var(perf_cpu_context); - perf_counter_mmap_ctx(¤t->perf_counter_ctx, mmap_event); + perf_counter_mmap_ctx(current->perf_counter_ctxp, mmap_event); kfree(buf); } @@ -2384,6 +2456,8 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, if (!atomic_read(&nr_mmap_tracking)) return; + if (!current->perf_counter_ctxp) + return; mmap_event = (struct perf_mmap_event){ .file = file, @@ -2985,6 +3059,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->group_leader = group_leader; counter->pmu = NULL; counter->ctx = ctx; + get_ctx(ctx); counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) @@ -3149,21 +3224,6 @@ err_put_context: goto out_fput; } -/* - * Initialize the perf_counter context in a task_struct: - */ -static void -__perf_counter_init_context(struct perf_counter_context *ctx, - struct task_struct *task) -{ - memset(ctx, 0, sizeof(*ctx)); - spin_lock_init(&ctx->lock); - mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->counter_list); - INIT_LIST_HEAD(&ctx->event_list); - ctx->task = task; -} - /* * inherit a counter from parent task to child task: */ @@ -3195,7 +3255,6 @@ inherit_counter(struct perf_counter *parent_counter, /* * Link it up in the child's context: */ - child_counter->task = child; add_counter_to_ctx(child_counter, child_ctx); child_counter->parent = parent_counter; @@ -3294,40 +3353,15 @@ __perf_counter_exit_task(struct task_struct *child, struct perf_counter *parent_counter; /* - * If we do not self-reap then we have to wait for the - * child task to unschedule (it will happen for sure), - * so that its counter is at its final count. (This - * condition triggers rarely - child tasks usually get - * off their CPU before the parent has a chance to - * get this far into the reaping action) + * Protect against concurrent operations on child_counter + * due its fd getting closed, etc. */ - if (child != current) { - wait_task_inactive(child, 0); - update_counter_times(child_counter); - list_del_counter(child_counter, child_ctx); - } else { - struct perf_cpu_context *cpuctx; - unsigned long flags; - - /* - * Disable and unlink this counter. - * - * Be careful about zapping the list - IRQ/NMI context - * could still be processing it: - */ - local_irq_save(flags); - perf_disable(); - - cpuctx = &__get_cpu_var(perf_cpu_context); + mutex_lock(&child_counter->mutex); - group_sched_out(child_counter, cpuctx, child_ctx); - update_counter_times(child_counter); + update_counter_times(child_counter); + list_del_counter(child_counter, child_ctx); - list_del_counter(child_counter, child_ctx); - - perf_enable(); - local_irq_restore(flags); - } + mutex_unlock(&child_counter->mutex); parent_counter = child_counter->parent; /* @@ -3346,19 +3380,29 @@ __perf_counter_exit_task(struct task_struct *child, * * Note: we may be running in child context, but the PID is not hashed * anymore so new counters will not be added. + * (XXX not sure that is true when we get called from flush_old_exec. + * -- paulus) */ void perf_counter_exit_task(struct task_struct *child) { struct perf_counter *child_counter, *tmp; struct perf_counter_context *child_ctx; + unsigned long flags; WARN_ON_ONCE(child != current); - child_ctx = &child->perf_counter_ctx; + child_ctx = child->perf_counter_ctxp; - if (likely(!child_ctx->nr_counters)) + if (likely(!child_ctx)) return; + local_irq_save(flags); + __perf_counter_task_sched_out(child_ctx); + child->perf_counter_ctxp = NULL; + local_irq_restore(flags); + + mutex_lock(&child_ctx->mutex); + again: list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, list_entry) @@ -3371,6 +3415,10 @@ again: */ if (!list_empty(&child_ctx->counter_list)) goto again; + + mutex_unlock(&child_ctx->mutex); + + put_ctx(child_ctx); } /* @@ -3382,19 +3430,25 @@ void perf_counter_init_task(struct task_struct *child) struct perf_counter *counter; struct task_struct *parent = current; - child_ctx = &child->perf_counter_ctx; - parent_ctx = &parent->perf_counter_ctx; - - __perf_counter_init_context(child_ctx, child); + child->perf_counter_ctxp = NULL; /* * This is executed from the parent task context, so inherit - * counters that have been marked for cloning: + * counters that have been marked for cloning. + * First allocate and initialize a context for the child. */ - if (likely(!parent_ctx->nr_counters)) + child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + if (!child_ctx) + return; + + parent_ctx = parent->perf_counter_ctxp; + if (likely(!parent_ctx || !parent_ctx->nr_counters)) return; + __perf_counter_init_context(child_ctx, child); + child->perf_counter_ctxp = child_ctx; + /* * Lock the parent list. No need to lock the child - not PID * hashed yet and not running, so nobody can access it. -- cgit v1.2.3 From 564c2b210add41df9a3a5aaa365c1d97cff6110d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 May 2009 14:27:22 +1000 Subject: perf_counter: Optimize context switch between identical inherited contexts When monitoring a process and its descendants with a set of inherited counters, we can often get the situation in a context switch where both the old (outgoing) and new (incoming) process have the same set of counters, and their values are ultimately going to be added together. In that situation it doesn't matter which set of counters are used to count the activity for the new process, so there is really no need to go through the process of reading the hardware counters and updating the old task's counters and then setting up the PMU for the new task. This optimizes the context switch in this situation. Instead of scheduling out the perf_counter_context for the old task and scheduling in the new context, we simply transfer the old context to the new task and keep using it without interruption. The new context gets transferred to the old task. This means that both tasks still have a valid perf_counter_context, so no special case is introduced when the old task gets scheduled in again, either on this CPU or another CPU. The equivalence of contexts is detected by keeping a pointer in each cloned context pointing to the context it was cloned from. To cope with the situation where a context is changed by adding or removing counters after it has been cloned, we also keep a generation number on each context which is incremented every time a context is changed. When a context is cloned we take a copy of the parent's generation number, and two cloned contexts are equivalent only if they have the same parent and the same generation number. In order that the parent context pointer remains valid (and is not reused), we increment the parent context's reference count for each context cloned from it. Since we don't have individual fds for the counters in a cloned context, the only thing that can make two clones of a given parent different after they have been cloned is enabling or disabling all counters with prctl. To account for this, we keep a count of the number of enabled counters in each context. Two contexts must have the same number of enabled counters to be considered equivalent. Here are some measurements of the context switch time as measured with the lat_ctx benchmark from lmbench, comparing the times obtained with and without this patch series: -----Unmodified----- With this patch series Counters: none 2 HW 4H+4S none 2 HW 4H+4S 2 processes: Average 3.44 6.45 11.24 3.12 3.39 3.60 St dev 0.04 0.04 0.13 0.05 0.17 0.19 8 processes: Average 6.45 8.79 14.00 5.57 6.23 7.57 St dev 1.27 1.04 0.88 1.42 1.46 1.42 32 processes: Average 5.56 8.43 13.78 5.28 5.55 7.15 St dev 0.41 0.47 0.53 0.54 0.57 0.81 The numbers are the mean and standard deviation of 20 runs of lat_ctx. The "none" columns are lat_ctx run directly without any counters. The "2 HW" columns are with lat_ctx run under perfstat, counting cycles and instructions. The "4H+4S" columns are lat_ctx run under perfstat with 4 hardware counters and 4 software counters (cycles, instructions, cache references, cache misses, task clock, context switch, cpu migrations, and page faults). [ Impact: performance optimization of counter context-switches ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10666.517218.332164@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 109 +++++++++++++++++++++++++++++++++++++++++++------- kernel/sched.c | 2 +- 2 files changed, 96 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 06ea3eae886..c10055416de 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -104,8 +104,11 @@ static void get_ctx(struct perf_counter_context *ctx) static void put_ctx(struct perf_counter_context *ctx) { - if (atomic_dec_and_test(&ctx->refcount)) + if (atomic_dec_and_test(&ctx->refcount)) { + if (ctx->parent_ctx) + put_ctx(ctx->parent_ctx); kfree(ctx); + } } static void @@ -127,6 +130,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_add_rcu(&counter->event_entry, &ctx->event_list); ctx->nr_counters++; + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + ctx->nr_enabled++; } /* @@ -141,6 +146,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) if (list_empty(&counter->list_entry)) return; ctx->nr_counters--; + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + ctx->nr_enabled--; list_del_init(&counter->list_entry); list_del_rcu(&counter->event_entry); @@ -203,6 +210,22 @@ group_sched_out(struct perf_counter *group_counter, cpuctx->exclusive = 0; } +/* + * Mark this context as not being a clone of another. + * Called when counters are added to or removed from this context. + * We also increment our generation number so that anything that + * was cloned from this context before this will not match anything + * cloned from this context after this. + */ +static void unclone_ctx(struct perf_counter_context *ctx) +{ + ++ctx->generation; + if (!ctx->parent_ctx) + return; + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; +} + /* * Cross CPU call to remove a performance counter * @@ -263,6 +286,7 @@ static void perf_counter_remove_from_context(struct perf_counter *counter) struct perf_counter_context *ctx = counter->ctx; struct task_struct *task = ctx->task; + unclone_ctx(ctx); if (!task) { /* * Per cpu counters are removed via an smp call and @@ -378,6 +402,7 @@ static void __perf_counter_disable(void *info) else counter_sched_out(counter, cpuctx, ctx); counter->state = PERF_COUNTER_STATE_OFF; + ctx->nr_enabled--; } spin_unlock_irqrestore(&ctx->lock, flags); @@ -419,6 +444,7 @@ static void perf_counter_disable(struct perf_counter *counter) if (counter->state == PERF_COUNTER_STATE_INACTIVE) { update_counter_times(counter); counter->state = PERF_COUNTER_STATE_OFF; + ctx->nr_enabled--; } spin_unlock_irq(&ctx->lock); @@ -727,6 +753,7 @@ static void __perf_counter_enable(void *info) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_enabled = ctx->time - counter->total_time_enabled; + ctx->nr_enabled++; /* * If the counter is in a group and isn't the group leader, @@ -817,6 +844,7 @@ static void perf_counter_enable(struct perf_counter *counter) counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_enabled = ctx->time - counter->total_time_enabled; + ctx->nr_enabled++; } out: spin_unlock_irq(&ctx->lock); @@ -861,6 +889,25 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, spin_unlock(&ctx->lock); } +/* + * Test whether two contexts are equivalent, i.e. whether they + * have both been cloned from the same version of the same context + * and they both have the same number of enabled counters. + * If the number of enabled counters is the same, then the set + * of enabled counters should be the same, because these are both + * inherited contexts, therefore we can't access individual counters + * in them directly with an fd; we can only enable/disable all + * counters via prctl, or enable/disable all counters in a family + * via ioctl, which will have the same effect on both contexts. + */ +static int context_equiv(struct perf_counter_context *ctx1, + struct perf_counter_context *ctx2) +{ + return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx + && ctx1->parent_gen == ctx2->parent_gen + && ctx1->nr_enabled == ctx2->nr_enabled; +} + /* * Called from scheduler to remove the counters of the current task, * with interrupts disabled. @@ -872,10 +919,12 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, * accessing the counter control register. If a NMI hits, then it will * not restart the counter. */ -void perf_counter_task_sched_out(struct task_struct *task, int cpu) +void perf_counter_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_counter_context *ctx = task->perf_counter_ctxp; + struct perf_counter_context *next_ctx; struct pt_regs *regs; if (likely(!ctx || !cpuctx->task_ctx)) @@ -885,6 +934,16 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) regs = task_pt_regs(task); perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); + + next_ctx = next->perf_counter_ctxp; + if (next_ctx && context_equiv(ctx, next_ctx)) { + task->perf_counter_ctxp = next_ctx; + next->perf_counter_ctxp = ctx; + ctx->task = next; + next_ctx->task = task; + return; + } + __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; @@ -998,6 +1057,8 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) if (likely(!ctx)) return; + if (cpuctx->task_ctx == ctx) + return; __perf_counter_sched_in(ctx, cpuctx, cpu); cpuctx->task_ctx = ctx; } @@ -3252,6 +3313,16 @@ inherit_counter(struct perf_counter *parent_counter, if (IS_ERR(child_counter)) return child_counter; + /* + * Make the child state follow the state of the parent counter, + * not its hw_event.disabled bit. We hold the parent's mutex, + * so we won't race with perf_counter_{en,dis}able_family. + */ + if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) + child_counter->state = PERF_COUNTER_STATE_INACTIVE; + else + child_counter->state = PERF_COUNTER_STATE_OFF; + /* * Link it up in the child's context: */ @@ -3277,16 +3348,6 @@ inherit_counter(struct perf_counter *parent_counter, mutex_lock(&parent_counter->mutex); list_add_tail(&child_counter->child_list, &parent_counter->child_list); - /* - * Make the child state follow the state of the parent counter, - * not its hw_event.disabled bit. We hold the parent's mutex, - * so we won't race with perf_counter_{en,dis}able_family. - */ - if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) - child_counter->state = PERF_COUNTER_STATE_INACTIVE; - else - child_counter->state = PERF_COUNTER_STATE_OFF; - mutex_unlock(&parent_counter->mutex); return child_counter; @@ -3429,6 +3490,7 @@ void perf_counter_init_task(struct task_struct *child) struct perf_counter_context *child_ctx, *parent_ctx; struct perf_counter *counter; struct task_struct *parent = current; + int inherited_all = 1; child->perf_counter_ctxp = NULL; @@ -3463,12 +3525,31 @@ void perf_counter_init_task(struct task_struct *child) if (counter != counter->group_leader) continue; - if (!counter->hw_event.inherit) + if (!counter->hw_event.inherit) { + inherited_all = 0; continue; + } if (inherit_group(counter, parent, - parent_ctx, child, child_ctx)) + parent_ctx, child, child_ctx)) { + inherited_all = 0; break; + } + } + + if (inherited_all) { + /* + * Mark the child context as a clone of the parent + * context, or of whatever the parent is a clone of. + */ + if (parent_ctx->parent_ctx) { + child_ctx->parent_ctx = parent_ctx->parent_ctx; + child_ctx->parent_gen = parent_ctx->parent_gen; + } else { + child_ctx->parent_ctx = parent_ctx; + child_ctx->parent_gen = parent_ctx->generation; + } + get_ctx(child_ctx->parent_ctx); } mutex_unlock(&parent_ctx->mutex); diff --git a/kernel/sched.c b/kernel/sched.c index 419a39d0988..4c0d58bce6b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5091,7 +5091,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); - perf_counter_task_sched_out(prev, cpu); + perf_counter_task_sched_out(prev, next, cpu); rq->nr_switches++; rq->curr = next; -- cgit v1.2.3 From 948cd52906baf1f92aeea2f9b5c515db1b2e592a Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 22 May 2009 10:40:09 +0900 Subject: sparseirq: Allow early irq_desc allocation Presently non-legacy IRQs have their irq_desc allocated with kzalloc_node(). This assumes that all callers of irq_to_desc_node_alloc() will be sufficiently late in the boot process that kmalloc is available. While porting sparseirq support to sh this blew up immediately, as at the time that we register the CPU's interrupt vector map only bootmem is available. Check slab_is_available() to work out which path to use. [ Impact: fix SH early boot crash with sparseirq enabled ] Signed-off-by: Paul Mundt Acked-by: Yinghai Lu Cc: Andrew Morton Cc: Mel Gorman LKML-Reference: <20090522014008.GA2806@linux-sh.org> Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a3c671e0f16..18041a254d3 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include @@ -81,11 +82,16 @@ static struct irq_desc irq_desc_init = { .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), }; -void init_kstat_irqs(struct irq_desc *desc, int node, int nr) +void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) { void *ptr; - ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); + if (slab_is_available()) + ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), + GFP_ATOMIC, node); + else + ptr = alloc_bootmem_node(NODE_DATA(node), + nr * sizeof(*desc->kstat_irqs)); /* * don't overwite if can not get new one @@ -186,7 +192,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) return NULL; } -struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) +struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) { struct irq_desc *desc; unsigned long flags; @@ -208,7 +214,11 @@ struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) if (desc) goto out_unlock; - desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); + if (slab_is_available()) + desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); + else + desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc)); + printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); if (!desc) { printk(KERN_ERR "can not alloc irq_desc\n"); -- cgit v1.2.3 From e220d2dcb944c5c488b6855d15ec66d76900514f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:28:55 +0200 Subject: perf_counter: Fix dynamic irq_period logging We call perf_adjust_freq() from perf_counter_task_tick() which is is called under the rq->lock causing lock recursion. However, it's no longer required to be called under the rq->lock, so remove it from under it. Also, fix up some related comments. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.476197912@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 3 ++- kernel/sched.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c10055416de..2f410ea2cb3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2559,7 +2559,8 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, } /* - * + * Log irq_period changes so that analyzing tools can re-normalize the + * event flow. */ static void perf_log_period(struct perf_counter *counter, u64 period) diff --git a/kernel/sched.c b/kernel/sched.c index 4c0d58bce6b..ad079f07c9c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4875,9 +4875,10 @@ void scheduler_tick(void) update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); - perf_counter_task_tick(curr, cpu); spin_unlock(&rq->lock); + perf_counter_task_tick(curr, cpu); + #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); -- cgit v1.2.3 From fccc714b3148ab9741fafc1e90c3876d50df6093 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:28:56 +0200 Subject: perf_counter: Sanitize counter->mutex s/counter->mutex/counter->child_mutex/ and make sure its only used to protect child_list. The usage in __perf_counter_exit_task() doesn't appear to be problematic since ctx->mutex also covers anything related to fd tear-down. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.533186528@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 47 +++++++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2f410ea2cb3..679c3b5bb7d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -111,6 +111,10 @@ static void put_ctx(struct perf_counter_context *ctx) } } +/* + * Add a counter from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -136,7 +140,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) /* * Remove a counter from the lists for its context. - * Must be called with counter->mutex and ctx->mutex held. + * Must be called with ctx->mutex and ctx->lock held. */ static void list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) @@ -276,7 +280,7 @@ static void __perf_counter_remove_from_context(void *info) /* * Remove the counter from a task's (or a CPU's) list of counters. * - * Must be called with counter->mutex and ctx->mutex held. + * Must be called with ctx->mutex held. * * CPU counters are removed with a smp call. For task counters we only * call when the task is on a CPU. @@ -1407,11 +1411,7 @@ static int perf_release(struct inode *inode, struct file *file) file->private_data = NULL; mutex_lock(&ctx->mutex); - mutex_lock(&counter->mutex); - perf_counter_remove_from_context(counter); - - mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); free_counter(counter); @@ -1437,7 +1437,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (counter->state == PERF_COUNTER_STATE_ERROR) return 0; - mutex_lock(&counter->mutex); + mutex_lock(&counter->child_mutex); values[0] = perf_counter_read(counter); n = 1; if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) @@ -1446,7 +1446,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = counter->total_time_running + atomic64_read(&counter->child_total_time_running); - mutex_unlock(&counter->mutex); + mutex_unlock(&counter->child_mutex); if (count < n * sizeof(u64)) return -EINVAL; @@ -1510,11 +1510,11 @@ static void perf_counter_for_each_child(struct perf_counter *counter, { struct perf_counter *child; - mutex_lock(&counter->mutex); + mutex_lock(&counter->child_mutex); func(counter); list_for_each_entry(child, &counter->child_list, child_list) func(child); - mutex_unlock(&counter->mutex); + mutex_unlock(&counter->child_mutex); } static void perf_counter_for_each(struct perf_counter *counter, @@ -1522,11 +1522,11 @@ static void perf_counter_for_each(struct perf_counter *counter, { struct perf_counter *child; - mutex_lock(&counter->mutex); + mutex_lock(&counter->child_mutex); perf_counter_for_each_sibling(counter, func); list_for_each_entry(child, &counter->child_list, child_list) perf_counter_for_each_sibling(child, func); - mutex_unlock(&counter->mutex); + mutex_unlock(&counter->child_mutex); } static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) @@ -3106,7 +3106,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, if (!group_leader) group_leader = counter; - mutex_init(&counter->mutex); + mutex_init(&counter->child_mutex); + INIT_LIST_HEAD(&counter->child_list); + INIT_LIST_HEAD(&counter->list_entry); INIT_LIST_HEAD(&counter->event_entry); INIT_LIST_HEAD(&counter->sibling_list); @@ -3114,8 +3116,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, mutex_init(&counter->mmap_mutex); - INIT_LIST_HEAD(&counter->child_list); - counter->cpu = cpu; counter->hw_event = *hw_event; counter->group_leader = group_leader; @@ -3346,10 +3346,9 @@ inherit_counter(struct perf_counter *parent_counter, /* * Link this into the parent counter's child list */ - mutex_lock(&parent_counter->mutex); + mutex_lock(&parent_counter->child_mutex); list_add_tail(&child_counter->child_list, &parent_counter->child_list); - - mutex_unlock(&parent_counter->mutex); + mutex_unlock(&parent_counter->child_mutex); return child_counter; } @@ -3396,9 +3395,9 @@ static void sync_child_counter(struct perf_counter *child_counter, /* * Remove this counter from the parent's list */ - mutex_lock(&parent_counter->mutex); + mutex_lock(&parent_counter->child_mutex); list_del_init(&child_counter->child_list); - mutex_unlock(&parent_counter->mutex); + mutex_unlock(&parent_counter->child_mutex); /* * Release the parent counter, if this was the last @@ -3414,17 +3413,9 @@ __perf_counter_exit_task(struct task_struct *child, { struct perf_counter *parent_counter; - /* - * Protect against concurrent operations on child_counter - * due its fd getting closed, etc. - */ - mutex_lock(&child_counter->mutex); - update_counter_times(child_counter); list_del_counter(child_counter, child_ctx); - mutex_unlock(&child_counter->mutex); - parent_counter = child_counter->parent; /* * It can happen that parent exits first, and has counters -- cgit v1.2.3 From 682076ae1de0aba9c2da509f7b19dc03e30a6e1f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:28:57 +0200 Subject: perf_counter: Sanitize context locking Ensure we're consistent with the context locks. context->mutex context->lock list_{add,del}_counter(); so that either lock is sufficient to stabilize the context. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.618790733@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 679c3b5bb7d..d162d2f0b27 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -597,6 +597,8 @@ static void add_counter_to_ctx(struct perf_counter *counter, /* * Cross CPU call to install and enable a performance counter + * + * Must be called with ctx->mutex held */ static void __perf_install_in_context(void *info) { @@ -1496,13 +1498,13 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter, struct perf_counter_context *ctx = counter->ctx; struct perf_counter *sibling; - spin_lock_irq(&ctx->lock); + mutex_lock(&ctx->mutex); counter = counter->group_leader; func(counter); list_for_each_entry(sibling, &counter->sibling_list, list_entry) func(sibling); - spin_unlock_irq(&ctx->lock); + mutex_unlock(&ctx->mutex); } static void perf_counter_for_each_child(struct perf_counter *counter, @@ -3414,7 +3416,10 @@ __perf_counter_exit_task(struct task_struct *child, struct perf_counter *parent_counter; update_counter_times(child_counter); + + spin_lock_irq(&child_ctx->lock); list_del_counter(child_counter, child_ctx); + spin_unlock_irq(&child_ctx->lock); parent_counter = child_counter->parent; /* -- cgit v1.2.3 From aa9c67f53d1969cf1db4c9c2db3a78c4ceb96469 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:28:59 +0200 Subject: perf_counter: Simplify context cleanup Use perf_counter_remove_from_context() to remove counters from the context. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.796275849@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d162d2f0b27..0e97f896133 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3416,10 +3416,7 @@ __perf_counter_exit_task(struct task_struct *child, struct perf_counter *parent_counter; update_counter_times(child_counter); - - spin_lock_irq(&child_ctx->lock); - list_del_counter(child_counter, child_ctx); - spin_unlock_irq(&child_ctx->lock); + perf_counter_remove_from_context(child_counter); parent_counter = child_counter->parent; /* -- cgit v1.2.3 From 082ff5a2767a0679ee543f14883adbafb631ffbe Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:29:00 +0200 Subject: perf_counter: Change pctrl() behaviour Instead of en/dis-abling all counters acting on a particular task, en/dis- able all counters we created. [ v2: fix crash on first counter enable ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.916937244@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 87 ++++++++++++++------------------------------------- 1 file changed, 24 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0e97f896133..4c86a636976 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1076,79 +1076,26 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) __perf_counter_sched_in(ctx, cpuctx, cpu); } -int perf_counter_task_disable(void) +int perf_counter_task_enable(void) { - struct task_struct *curr = current; - struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; - unsigned long flags; - - if (!ctx || !ctx->nr_counters) - return 0; - - local_irq_save(flags); - __perf_counter_task_sched_out(ctx); - - spin_lock(&ctx->lock); - - /* - * Disable all the counters: - */ - perf_disable(); - - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ERROR) { - update_group_times(counter); - counter->state = PERF_COUNTER_STATE_OFF; - } - } - - perf_enable(); - - spin_unlock_irqrestore(&ctx->lock, flags); + mutex_lock(¤t->perf_counter_mutex); + list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) + perf_counter_enable(counter); + mutex_unlock(¤t->perf_counter_mutex); return 0; } -int perf_counter_task_enable(void) +int perf_counter_task_disable(void) { - struct task_struct *curr = current; - struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; - unsigned long flags; - int cpu; - - if (!ctx || !ctx->nr_counters) - return 0; - - local_irq_save(flags); - cpu = smp_processor_id(); - - __perf_counter_task_sched_out(ctx); - - spin_lock(&ctx->lock); - /* - * Disable all the counters: - */ - perf_disable(); - - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state > PERF_COUNTER_STATE_OFF) - continue; - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = - ctx->time - counter->total_time_enabled; - counter->hw_event.disabled = 0; - } - perf_enable(); - - spin_unlock(&ctx->lock); - - perf_counter_task_sched_in(curr, cpu); - - local_irq_restore(flags); + mutex_lock(¤t->perf_counter_mutex); + list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) + perf_counter_disable(counter); + mutex_unlock(¤t->perf_counter_mutex); return 0; } @@ -1416,6 +1363,11 @@ static int perf_release(struct inode *inode, struct file *file) perf_counter_remove_from_context(counter); mutex_unlock(&ctx->mutex); + mutex_lock(&counter->owner->perf_counter_mutex); + list_del_init(&counter->owner_entry); + mutex_unlock(&counter->owner->perf_counter_mutex); + put_task_struct(counter->owner); + free_counter(counter); put_context(ctx); @@ -3272,6 +3224,12 @@ SYSCALL_DEFINE5(perf_counter_open, perf_install_in_context(ctx, counter, cpu); mutex_unlock(&ctx->mutex); + counter->owner = current; + get_task_struct(current); + mutex_lock(¤t->perf_counter_mutex); + list_add_tail(&counter->owner_entry, ¤t->perf_counter_list); + mutex_unlock(¤t->perf_counter_mutex); + fput_light(counter_file, fput_needed2); out_fput: @@ -3488,6 +3446,9 @@ void perf_counter_init_task(struct task_struct *child) child->perf_counter_ctxp = NULL; + mutex_init(&child->perf_counter_mutex); + INIT_LIST_HEAD(&child->perf_counter_list); + /* * This is executed from the parent task context, so inherit * counters that have been marked for cloning. -- cgit v1.2.3 From 475c55797323b67435083f6e2eb8ee670f6410ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:29:01 +0200 Subject: perf_counter: Remove perf_counter_context::nr_enabled now that pctrl() no longer disables other people's counters, remove the PMU cache code that deals with that. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163013.032998331@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4c86a636976..cb4062559b4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -134,8 +134,6 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_add_rcu(&counter->event_entry, &ctx->event_list); ctx->nr_counters++; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - ctx->nr_enabled++; } /* @@ -150,8 +148,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) if (list_empty(&counter->list_entry)) return; ctx->nr_counters--; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - ctx->nr_enabled--; list_del_init(&counter->list_entry); list_del_rcu(&counter->event_entry); @@ -406,7 +402,6 @@ static void __perf_counter_disable(void *info) else counter_sched_out(counter, cpuctx, ctx); counter->state = PERF_COUNTER_STATE_OFF; - ctx->nr_enabled--; } spin_unlock_irqrestore(&ctx->lock, flags); @@ -448,7 +443,6 @@ static void perf_counter_disable(struct perf_counter *counter) if (counter->state == PERF_COUNTER_STATE_INACTIVE) { update_counter_times(counter); counter->state = PERF_COUNTER_STATE_OFF; - ctx->nr_enabled--; } spin_unlock_irq(&ctx->lock); @@ -759,7 +753,6 @@ static void __perf_counter_enable(void *info) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_enabled = ctx->time - counter->total_time_enabled; - ctx->nr_enabled++; /* * If the counter is in a group and isn't the group leader, @@ -850,7 +843,6 @@ static void perf_counter_enable(struct perf_counter *counter) counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_enabled = ctx->time - counter->total_time_enabled; - ctx->nr_enabled++; } out: spin_unlock_irq(&ctx->lock); @@ -910,8 +902,7 @@ static int context_equiv(struct perf_counter_context *ctx1, struct perf_counter_context *ctx2) { return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx - && ctx1->parent_gen == ctx2->parent_gen - && ctx1->nr_enabled == ctx2->nr_enabled; + && ctx1->parent_gen == ctx2->parent_gen; } /* -- cgit v1.2.3 From a3862d3f814ce7dfca9eed56ac23d29db3aee8d5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 24 May 2009 09:02:37 +0200 Subject: perf_counter: Increase mmap limit In a default 'perf top' run the tool will create a counter for each online CPU. With enough CPUs this will eventually exhaust the default limit. So scale it up with the number of online CPUs. Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index cb4062559b4..6cdf8248eda 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1704,6 +1704,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) user_extra = nr_pages + 1; user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); + + /* + * Increase the limit linearly with more CPUs: + */ + user_lock_limit *= num_online_cpus(); + user_locked = atomic_long_read(&user->locked_vm) + user_extra; extra = 0; -- cgit v1.2.3 From 32bdfac5462d777f35b00838893c4f87baf23efe Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 24 May 2009 21:15:07 +0200 Subject: PM: Do not hold dpm_list_mtx while disabling/enabling nonboot CPUs We shouldn't hold dpm_list_mtx while executing [disable|enable]_nonboot_cpus(), because theoretically this may lead to a deadlock as shown by the following example (provided by Johannes Berg): CPU 3 CPU 2 CPU 1 suspend/hibernate something: rtnl_lock() device_pm_lock() -> mutex_lock(&dpm_list_mtx) mutex_lock(&dpm_list_mtx) linkwatch_work -> rtnl_lock() disable_nonboot_cpus() -> flush CPU 3 workqueue Fortunately, device drivers are supposed to stop any activities that might lead to the registration of new device objects way before disable_nonboot_cpus() is called, so it shouldn't be necessary to hold dpm_list_mtx over the entire late part of device suspend and early part of device resume. Thus, during the late suspend and the early resume of devices acquire dpm_list_mtx only when dpm_list is going to be traversed and release it right after that. This patch is reported to fix the regressions tracked as http://bugzilla.kernel.org/show_bug.cgi?id=13245. Signed-off-by: Rafael J. Wysocki Acked-by: Alan Stern Reported-by: Miles Lane Tested-by: Ming Lei --- kernel/kexec.c | 2 -- kernel/power/disk.c | 21 +++------------------ kernel/power/main.c | 7 +------ 3 files changed, 4 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 5a758c6e495..e4983770913 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1451,7 +1451,6 @@ int kernel_kexec(void) error = device_suspend(PMSG_FREEZE); if (error) goto Resume_console; - device_pm_lock(); /* At this point, device_suspend() has been called, * but *not* device_power_down(). We *must* * device_power_down() now. Otherwise, drivers for @@ -1489,7 +1488,6 @@ int kernel_kexec(void) enable_nonboot_cpus(); device_power_up(PMSG_RESTORE); Resume_devices: - device_pm_unlock(); device_resume(PMSG_RESTORE); Resume_console: resume_console(); diff --git a/kernel/power/disk.c b/kernel/power/disk.c index b0dc9e7a0d1..5cb080e7eeb 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -215,8 +215,6 @@ static int create_image(int platform_mode) if (error) return error; - device_pm_lock(); - /* At this point, device_suspend() has been called, but *not* * device_power_down(). We *must* call device_power_down() now. * Otherwise, drivers for some devices (e.g. interrupt controllers) @@ -227,7 +225,7 @@ static int create_image(int platform_mode) if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); - goto Unlock; + return error; } error = platform_pre_snapshot(platform_mode); @@ -280,9 +278,6 @@ static int create_image(int platform_mode) device_power_up(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - Unlock: - device_pm_unlock(); - return error; } @@ -344,13 +339,11 @@ static int resume_target_kernel(bool platform_mode) { int error; - device_pm_lock(); - error = device_power_down(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); - goto Unlock; + return error; } error = platform_pre_restore(platform_mode); @@ -403,9 +396,6 @@ static int resume_target_kernel(bool platform_mode) device_power_up(PMSG_RECOVER); - Unlock: - device_pm_unlock(); - return error; } @@ -464,11 +454,9 @@ int hibernation_platform_enter(void) goto Resume_devices; } - device_pm_lock(); - error = device_power_down(PMSG_HIBERNATE); if (error) - goto Unlock; + goto Resume_devices; error = hibernation_ops->prepare(); if (error) @@ -493,9 +481,6 @@ int hibernation_platform_enter(void) device_power_up(PMSG_RESTORE); - Unlock: - device_pm_unlock(); - Resume_devices: entering_platform_hibernation = false; device_resume(PMSG_RESTORE); diff --git a/kernel/power/main.c b/kernel/power/main.c index f99ed6a75ea..868028280d1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -289,12 +289,10 @@ static int suspend_enter(suspend_state_t state) { int error; - device_pm_lock(); - if (suspend_ops->prepare) { error = suspend_ops->prepare(); if (error) - goto Done; + return error; } error = device_power_down(PMSG_SUSPEND); @@ -343,9 +341,6 @@ static int suspend_enter(suspend_state_t state) if (suspend_ops->finish) suspend_ops->finish(); - Done: - device_pm_unlock(); - return error; } -- cgit v1.2.3 From d5a877e8dd409d8c702986d06485c374b705d340 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Sun, 24 May 2009 13:03:43 -0700 Subject: async: make sure independent async domains can't accidentally entangle The problem occurs when async_synchronize_full_domain() is called when the async_pending list is not empty. This will cause lowest_running() to return the cookie of the first entry on the async_pending list, which might be nothing at all to do with the domain being asked for and thus cause the domain synchronization to wait for an unrelated domain. This can cause a deadlock if domain synchronization is used from one domain to wait for another. Fix by running over the async_pending list to see if any pending items actually belong to our domain (and return their cookies if they do). Signed-off-by: James Bottomley Signed-off-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- kernel/async.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 968ef9457d4..50540301ed0 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -92,19 +92,23 @@ extern int initcall_debug; static async_cookie_t __lowest_in_progress(struct list_head *running) { struct async_entry *entry; + async_cookie_t ret = next_cookie; /* begin with "infinity" value */ + if (!list_empty(running)) { entry = list_first_entry(running, struct async_entry, list); - return entry->cookie; - } else if (!list_empty(&async_pending)) { - entry = list_first_entry(&async_pending, - struct async_entry, list); - return entry->cookie; - } else { - /* nothing in progress... next_cookie is "infinity" */ - return next_cookie; + ret = entry->cookie; } + if (!list_empty(&async_pending)) { + list_for_each_entry(entry, &async_pending, list) + if (entry->running == running) { + ret = entry->cookie; + break; + } + } + + return ret; } static async_cookie_t lowest_in_progress(struct list_head *running) -- cgit v1.2.3 From e4cbb4e3ac8b09fdb11e39e5a5611bfab0a7cd1a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 19 May 2009 15:50:30 +0200 Subject: perf_counter: Move child perfcounter init to after scheduler init Initialize a task's perfcounters (inherit from parent, etc.) after the child task's scheduler fields have been initialized already. [ Impact: cleanup ] Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e72a09f5355..675e01e9072 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -984,7 +984,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; rt_mutex_init_task(p); - perf_counter_init_task(p); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); @@ -1096,6 +1095,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); + perf_counter_init_task(p); if ((retval = audit_alloc(p))) goto bad_fork_cleanup_policy; -- cgit v1.2.3 From 771d7cde144d87f2d1fbee4da3c6234d61f7e42a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 14:45:26 +0200 Subject: perf_counter: Make pctrl() affect inherited counters too Paul noted that the new ptcrl() didn't work on child counters. Reported-by: Paul Mackerras Signed-off-by: Peter Zijlstra Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525124600.203151469@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 6cdf8248eda..217dbcce2eb 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1067,30 +1067,6 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) __perf_counter_sched_in(ctx, cpuctx, cpu); } -int perf_counter_task_enable(void) -{ - struct perf_counter *counter; - - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_enable(counter); - mutex_unlock(¤t->perf_counter_mutex); - - return 0; -} - -int perf_counter_task_disable(void) -{ - struct perf_counter *counter; - - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_disable(counter); - mutex_unlock(¤t->perf_counter_mutex); - - return 0; -} - static void perf_log_period(struct perf_counter *counter, u64 period); static void perf_adjust_freq(struct perf_counter_context *ctx) @@ -1505,6 +1481,30 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return 0; } +int perf_counter_task_enable(void) +{ + struct perf_counter *counter; + + mutex_lock(¤t->perf_counter_mutex); + list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) + perf_counter_for_each_child(counter, perf_counter_enable); + mutex_unlock(¤t->perf_counter_mutex); + + return 0; +} + +int perf_counter_task_disable(void) +{ + struct perf_counter *counter; + + mutex_lock(¤t->perf_counter_mutex); + list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) + perf_counter_for_each_child(counter, perf_counter_disable); + mutex_unlock(¤t->perf_counter_mutex); + + return 0; +} + /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch -- cgit v1.2.3 From 6ab423e0eaca827fbd201ca4ae7d4f8573a366b2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 14:45:27 +0200 Subject: perf_counter: Propagate inheritance failures down the fork() path Fail fork() when we fail inheritance for some reason (-ENOMEM most likely). Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525124600.324656474@chello.nl> Signed-off-by: Ingo Molnar --- kernel/fork.c | 6 +++++- kernel/perf_counter.c | 20 ++++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 675e01e9072..c07c3335cea 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1095,7 +1095,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); - perf_counter_init_task(p); + + retval = perf_counter_init_task(p); + if (retval) + goto bad_fork_cleanup_policy; if ((retval = audit_alloc(p))) goto bad_fork_cleanup_policy; @@ -1295,6 +1298,7 @@ bad_fork_cleanup_semundo: bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: + perf_counter_exit_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 217dbcce2eb..7a7a144870e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3434,18 +3434,23 @@ again: /* * Initialize the perf_counter context in task_struct */ -void perf_counter_init_task(struct task_struct *child) +int perf_counter_init_task(struct task_struct *child) { struct perf_counter_context *child_ctx, *parent_ctx; struct perf_counter *counter; struct task_struct *parent = current; int inherited_all = 1; + int ret = 0; child->perf_counter_ctxp = NULL; mutex_init(&child->perf_counter_mutex); INIT_LIST_HEAD(&child->perf_counter_list); + parent_ctx = parent->perf_counter_ctxp; + if (likely(!parent_ctx || !parent_ctx->nr_counters)) + return 0; + /* * This is executed from the parent task context, so inherit * counters that have been marked for cloning. @@ -3454,11 +3459,7 @@ void perf_counter_init_task(struct task_struct *child) child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); if (!child_ctx) - return; - - parent_ctx = parent->perf_counter_ctxp; - if (likely(!parent_ctx || !parent_ctx->nr_counters)) - return; + return -ENOMEM; __perf_counter_init_context(child_ctx, child); child->perf_counter_ctxp = child_ctx; @@ -3482,8 +3483,9 @@ void perf_counter_init_task(struct task_struct *child) continue; } - if (inherit_group(counter, parent, - parent_ctx, child, child_ctx)) { + ret = inherit_group(counter, parent, parent_ctx, + child, child_ctx); + if (ret) { inherited_all = 0; break; } @@ -3505,6 +3507,8 @@ void perf_counter_init_task(struct task_struct *child) } mutex_unlock(&parent_ctx->mutex); + + return ret; } static void __cpuinit perf_counter_init_cpu(int cpu) -- cgit v1.2.3 From 10989fb2451763fae6f42d85fa6106c8fd010cf5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 14:45:28 +0200 Subject: perf_counter: Fix PERF_COUNTER_CONTEXT_SWITCHES for cpu counters Ingo noticed that cpu counters had 0 context switches, even though there was plenty scheduling on the cpu. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525124600.419025548@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7a7a144870e..14b1fe98483 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -924,14 +924,13 @@ void perf_counter_task_sched_out(struct task_struct *task, struct perf_counter_context *next_ctx; struct pt_regs *regs; + regs = task_pt_regs(task); + perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); + if (likely(!ctx || !cpuctx->task_ctx)) return; update_context_time(ctx); - - regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); - next_ctx = next->perf_counter_ctxp; if (next_ctx && context_equiv(ctx, next_ctx)) { task->perf_counter_ctxp = next_ctx; -- cgit v1.2.3 From a78ac3258782f3e64cb40beb5990808e1febcc0c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:05 +0200 Subject: perf_counter: Generic per counter interrupt throttle Introduce a generic per counter interrupt throttle. This uses the perf_counter_overflow() quick disable to throttle a specific counter when its going too fast when a pmu->unthrottle() method is provided which can undo the quick disable. Power needs to implement both the quick disable and the unthrottle method. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.703093461@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++---- kernel/sysctl.c | 8 +++++++ 2 files changed, 63 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 14b1fe98483..ec9c4007a7f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -46,6 +46,7 @@ static atomic_t nr_comm_tracking __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ +int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */ /* * Lock for (sysadmin-configurable) counter reservations: @@ -1066,12 +1067,15 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) __perf_counter_sched_in(ctx, cpuctx, cpu); } +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_counter *counter, int enable); static void perf_log_period(struct perf_counter *counter, u64 period); static void perf_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; - u64 irq_period; + u64 interrupts, irq_period; u64 events, period; s64 delta; @@ -1080,10 +1084,19 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) if (counter->state != PERF_COUNTER_STATE_ACTIVE) continue; + interrupts = counter->hw.interrupts; + counter->hw.interrupts = 0; + + if (interrupts == MAX_INTERRUPTS) { + perf_log_throttle(counter, 1); + counter->pmu->unthrottle(counter); + interrupts = 2*sysctl_perf_counter_limit/HZ; + } + if (!counter->hw_event.freq || !counter->hw_event.irq_freq) continue; - events = HZ * counter->hw.interrupts * counter->hw.irq_period; + events = HZ * interrupts * counter->hw.irq_period; period = div64_u64(events, counter->hw_event.irq_freq); delta = (s64)(1 + period - counter->hw.irq_period); @@ -1097,7 +1110,6 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) perf_log_period(counter, irq_period); counter->hw.irq_period = irq_period; - counter->hw.interrupts = 0; } spin_unlock(&ctx->lock); } @@ -2543,6 +2555,35 @@ static void perf_log_period(struct perf_counter *counter, u64 period) perf_output_end(&handle); } +/* + * IRQ throttle logging + */ + +static void perf_log_throttle(struct perf_counter *counter, int enable) +{ + struct perf_output_handle handle; + int ret; + + struct { + struct perf_event_header header; + u64 time; + } throttle_event = { + .header = { + .type = PERF_EVENT_THROTTLE + 1, + .misc = 0, + .size = sizeof(throttle_event), + }, + .time = sched_clock(), + }; + + ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 0, 0); + if (ret) + return; + + perf_output_put(&handle, throttle_event); + perf_output_end(&handle); +} + /* * Generic counter overflow handling. */ @@ -2551,9 +2592,19 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { int events = atomic_read(&counter->event_limit); + int throttle = counter->pmu->unthrottle != NULL; int ret = 0; - counter->hw.interrupts++; + if (!throttle) { + counter->hw.interrupts++; + } else if (counter->hw.interrupts != MAX_INTERRUPTS) { + counter->hw.interrupts++; + if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) { + counter->hw.interrupts = MAX_INTERRUPTS; + perf_log_throttle(counter, 0); + ret = 1; + } + } /* * XXX event_limit might not quite work as expected on inherited diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3cb1849f598..0c4bf863afa 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -930,6 +930,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "perf_counter_int_limit", + .data = &sysctl_perf_counter_limit, + .maxlen = sizeof(sysctl_perf_counter_limit), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif /* * NOTE: do not add new entries to this table unless you have read -- cgit v1.2.3 From 0127c3ea082ee9f1034789b978dfc7fd83254617 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2009 22:03:26 +0200 Subject: perf_counter: fix warning & lockup - remove bogus warning - fix wakeup from NMI path lockup - also fix up whitespace noise in perf_counter.h Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.703093461@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ec9c4007a7f..070f92d3232 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2576,7 +2576,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) .time = sched_clock(), }; - ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 0, 0); + ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); if (ret) return; @@ -3449,8 +3449,6 @@ void perf_counter_exit_task(struct task_struct *child) struct perf_counter_context *child_ctx; unsigned long flags; - WARN_ON_ONCE(child != current); - child_ctx = child->perf_counter_ctxp; if (likely(!child_ctx)) -- cgit v1.2.3 From 4f5359685af6de7dca101393dc606620adbe963f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 18 May 2009 19:35:34 +0800 Subject: tracing: add trace_event_read_lock() I found that there is nothing to protect event_hash in ftrace_find_event(). Rcu protects the event hashlist but not the event itself while we use it after its extraction through ftrace_find_event(). This lack of a proper locking in this spot opens a race window between any event dereferencing and module removal. Eg: --Task A-- print_trace_line(trace) { event = find_ftrace_event(trace) --Task B-- trace_module_remove_events(mod) { list_trace_events_module(ev, mod) { unregister_ftrace_event(ev->event) { hlist_del(ev->event->node) list_del(....) } } } |--> module removed, the event has been dropped --Task A-- event->print(trace); // Dereferencing freed memory If the event retrieved belongs to a module and this module is concurrently removed, we may end up dereferencing a data from a freed module. RCU could solve this, but it would add latency to the kernel and forbid tracers output callbacks to call any sleepable code. So this fix converts 'trace_event_mutex' to a read/write semaphore, and adds trace_event_read_lock() to protect ftrace_find_event(). [ Impact: fix possible freed memory dereference in ftrace ] Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt LKML-Reference: <4A114806.7090302@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 8 ++++++++ kernel/trace/trace_output.c | 25 ++++++++++++++++++------- kernel/trace/trace_output.h | 2 ++ 3 files changed, 28 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dd40d232034..02d32baa23a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1569,12 +1569,14 @@ static void *s_start(struct seq_file *m, loff_t *pos) p = s_next(m, p, &l); } + trace_event_read_lock(); return p; } static void s_stop(struct seq_file *m, void *p) { atomic_dec(&trace_record_cmdline_disabled); + trace_event_read_unlock(); } static void print_lat_help_header(struct seq_file *m) @@ -1817,6 +1819,7 @@ static int trace_empty(struct trace_iterator *iter) return 1; } +/* Called with trace_event_read_lock() held. */ static enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; @@ -3008,6 +3011,7 @@ waitagain: offsetof(struct trace_iterator, seq)); iter->pos = -1; + trace_event_read_lock(); while (find_next_entry_inc(iter) != NULL) { enum print_line_t ret; int len = iter->seq.len; @@ -3024,6 +3028,7 @@ waitagain: if (iter->seq.len >= cnt) break; } + trace_event_read_unlock(); /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); @@ -3146,6 +3151,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, goto out_err; } + trace_event_read_lock(); + /* Fill as many pages as possible. */ for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { pages[i] = alloc_page(GFP_KERNEL); @@ -3168,6 +3175,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, trace_seq_init(&iter->seq); } + trace_event_read_unlock(); mutex_unlock(&iter->mutex); spd.nr_pages = i; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 489c0e8ada0..7136420603a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -14,7 +14,7 @@ /* must be a power of 2 */ #define EVENT_HASHSIZE 128 -static DEFINE_MUTEX(trace_event_mutex); +static DECLARE_RWSEM(trace_event_mutex); static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; @@ -466,6 +466,7 @@ static int task_state_char(unsigned long state) * @type: the type of event to look for * * Returns an event of type @type otherwise NULL + * Called with trace_event_read_lock() held. */ struct trace_event *ftrace_find_event(int type) { @@ -475,7 +476,7 @@ struct trace_event *ftrace_find_event(int type) key = type & (EVENT_HASHSIZE - 1); - hlist_for_each_entry_rcu(event, n, &event_hash[key], node) { + hlist_for_each_entry(event, n, &event_hash[key], node) { if (event->type == type) return event; } @@ -513,6 +514,16 @@ static int trace_search_list(struct list_head **list) return last + 1; } +void trace_event_read_lock(void) +{ + down_read(&trace_event_mutex); +} + +void trace_event_read_unlock(void) +{ + up_read(&trace_event_mutex); +} + /** * register_ftrace_event - register output for an event type * @event: the event type to register @@ -533,7 +544,7 @@ int register_ftrace_event(struct trace_event *event) unsigned key; int ret = 0; - mutex_lock(&trace_event_mutex); + down_write(&trace_event_mutex); if (WARN_ON(!event)) goto out; @@ -581,11 +592,11 @@ int register_ftrace_event(struct trace_event *event) key = event->type & (EVENT_HASHSIZE - 1); - hlist_add_head_rcu(&event->node, &event_hash[key]); + hlist_add_head(&event->node, &event_hash[key]); ret = event->type; out: - mutex_unlock(&trace_event_mutex); + up_write(&trace_event_mutex); return ret; } @@ -597,10 +608,10 @@ EXPORT_SYMBOL_GPL(register_ftrace_event); */ int unregister_ftrace_event(struct trace_event *event) { - mutex_lock(&trace_event_mutex); + down_write(&trace_event_mutex); hlist_del(&event->node); list_del(&event->list); - mutex_unlock(&trace_event_mutex); + up_write(&trace_event_mutex); return 0; } diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 6e220a8e570..ac240e76eb0 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -20,6 +20,8 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); +extern void trace_event_read_lock(void); +extern void trace_event_read_unlock(void); extern struct trace_event *ftrace_find_event(int type); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, -- cgit v1.2.3 From b11c53e12f94a46b50bccc7a1a953d7ca1d54a31 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 25 May 2009 18:11:59 +0800 Subject: ftrace: Add task_comm support for trace_event If we enable a trace event alone without any tracer running (such as function tracer, sched switch tracer, etc...) it can't output enough task command information. We need to use the tracing_{start/stop}_cmdline_record() helpers which are designed to keep track of cmdlines for any tasks that were scheduled during the tracing. Before this patch: # echo 1 > debugfs/tracing/events/sched/sched_switch/enable # cat debugfs/tracing/trace # tracer: nop # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | <...>-2289 [000] 526276.724790: sched_switch: task bash:2289 [120] ==> sshd:2287 [120] <...>-2287 [000] 526276.725231: sched_switch: task sshd:2287 [120] ==> bash:2289 [120] <...>-2289 [000] 526276.725452: sched_switch: task bash:2289 [120] ==> sshd:2287 [120] <...>-2287 [000] 526276.727181: sched_switch: task sshd:2287 [120] ==> swapper:0 [140] -0 [000] 526277.032734: sched_switch: task swapper:0 [140] ==> events/0:5 [115] <...>-5 [000] 526277.032782: sched_switch: task events/0:5 [115] ==> swapper:0 [140] ... After this patch: # tracer: nop # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | bash-2269 [000] 527347.989229: sched_switch: task bash:2269 [120] ==> sshd:2267 [120] sshd-2267 [000] 527347.990960: sched_switch: task sshd:2267 [120] ==> bash:2269 [120] bash-2269 [000] 527347.991143: sched_switch: task bash:2269 [120] ==> sshd:2267 [120] sshd-2267 [000] 527347.992959: sched_switch: task sshd:2267 [120] ==> swapper:0 [140] -0 [000] 527348.531989: sched_switch: task swapper:0 [140] ==> events/0:5 [115] events/0-5 [000] 527348.532115: sched_switch: task events/0:5 [115] ==> swapper:0 [140] ... Changelog: v1->v2: Update Kconfig to select CONTEXT_SWITCH_TRACER in ENABLE_EVENT_TRACING v2->v3: v2 can solve problem that was caused by config EVENT_TRACING alone, but when CONFIG_FTRACE is off and CONFIG_TRACING is selected by other config, compile fail happened again. This version solves it. [ Impact: fix incomplete output of event tracing ] Signed-off-by: Zhao Lei Cc: Tom Zanussi Cc: Steven Rostedt LKML-Reference: <4A14FDFE.2080402@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/Kconfig | 9 +++++++-- kernel/trace/trace_events.c | 6 ++++++ 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f61be301578..a508b9d2adb 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -49,6 +49,11 @@ config FTRACE_NMI_ENTER default y config EVENT_TRACING + select CONTEXT_SWITCH_TRACER + bool + +config CONTEXT_SWITCH_TRACER + select MARKERS bool config TRACING @@ -176,10 +181,10 @@ config SCHED_TRACER This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. -config CONTEXT_SWITCH_TRACER +config ENABLE_CONTEXT_SWITCH_TRACER bool "Trace process context switches" select TRACING - select MARKERS + select CONTEXT_SWITCH_TRACER help This tracer gets called from the context switch and records all switching of tasks. diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9e91c4ad7c8..9b246eb01d5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -85,6 +85,7 @@ static void ftrace_clear_events(void) if (call->enabled) { call->enabled = 0; + tracing_stop_cmdline_record(); call->unregfunc(); } } @@ -99,12 +100,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, case 0: if (call->enabled) { call->enabled = 0; + tracing_stop_cmdline_record(); call->unregfunc(); } break; case 1: if (!call->enabled) { call->enabled = 1; + tracing_start_cmdline_record(); call->regfunc(); } break; @@ -1058,6 +1061,7 @@ static void trace_module_remove_events(struct module *mod) found = true; if (call->enabled) { call->enabled = 0; + tracing_stop_cmdline_record(); call->unregfunc(); } if (call->event) @@ -1262,11 +1266,13 @@ static __init void event_trace_self_tests(void) } call->enabled = 1; + tracing_start_cmdline_record(); call->regfunc(); event_test_stuff(); call->unregfunc(); + tracing_stop_cmdline_record(); call->enabled = 0; pr_cont("OK\n"); -- cgit v1.2.3 From 0e907c99391362385c8e3af2c43b904dd1fd5d73 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 25 May 2009 18:13:59 +0800 Subject: ftrace: clean up of using ftrace_event_enable_disable() Always use ftrace_event_enable_disable() to enable/disable an event so that we can factorize out the event toggling code. [ Impact: factorize and cleanup event tracing code ] Signed-off-by: Zhao Lei Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <4A14FDFE.2080402@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events.c | 44 ++++++++++++++------------------------------ 1 file changed, 14 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9b246eb01d5..6c81f9c2142 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -76,26 +76,9 @@ static void trace_destroy_fields(struct ftrace_event_call *call) #endif /* CONFIG_MODULES */ -static void ftrace_clear_events(void) -{ - struct ftrace_event_call *call; - - mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - - if (call->enabled) { - call->enabled = 0; - tracing_stop_cmdline_record(); - call->unregfunc(); - } - } - mutex_unlock(&event_mutex); -} - static void ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { - switch (enable) { case 0: if (call->enabled) { @@ -114,6 +97,17 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, } } +static void ftrace_clear_events(void) +{ + struct ftrace_event_call *call; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + ftrace_event_enable_disable(call, 0); + } + mutex_unlock(&event_mutex); +} + /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. */ @@ -1059,11 +1053,7 @@ static void trace_module_remove_events(struct module *mod) list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { found = true; - if (call->enabled) { - call->enabled = 0; - tracing_stop_cmdline_record(); - call->unregfunc(); - } + ftrace_event_enable_disable(call, 0); if (call->event) unregister_ftrace_event(call->event); debugfs_remove_recursive(call->dir); @@ -1265,15 +1255,9 @@ static __init void event_trace_self_tests(void) continue; } - call->enabled = 1; - tracing_start_cmdline_record(); - call->regfunc(); - + ftrace_event_enable_disable(call, 1); event_test_stuff(); - - call->unregfunc(); - tracing_stop_cmdline_record(); - call->enabled = 0; + ftrace_event_enable_disable(call, 0); pr_cont("OK\n"); } -- cgit v1.2.3 From 329d876d6fd326109f191ae0fb2798b8834fb70b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 May 2009 08:10:00 +0200 Subject: perf_counter: Initialize ->oncpu properly This shouldnt matter normally (and i have not seen any misbehavior), because active counters always have a proper ->oncpu value - but nevertheless initialize the field properly to -1. [ Impact: cleanup ] Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 070f92d3232..367299f91aa 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3122,6 +3122,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->group_leader = group_leader; counter->pmu = NULL; counter->ctx = ctx; + counter->oncpu = -1; + get_ctx(ctx); counter->state = PERF_COUNTER_STATE_INACTIVE; -- cgit v1.2.3 From be74b73a57645cc253d881ab0c1014eb64b9cf22 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 26 May 2009 20:25:22 +0200 Subject: tracing: add __print_flags for events Developers have been asking for the ability in the ftrace event tracer to display names of bits in a flags variable. Instead of printing out c2, it would be easier to read FOO|BAR|GOO, assuming that FOO is bit 1, BAR is bit 6 and GOO is bit 7. Some examples where this would be useful are the state flags in a context switch, kmalloc flags, and even permision flags in accessing files. [ v2 changes include: Frederic Weisbecker's idea of using a mask instead of bits, thus we can output GFP_KERNEL instead of GPF_WAIT|GFP_IO|GFP_FS. Li Zefan's idea of allowing the caller of __print_flags to add their own delimiter (or no delimiter) where we can get for file permissions rwx instead of r|w|x. ] [ v3 changes: Christoph Hellwig's idea of using an array instead of va_args. ] [ Impact: better displaying of flags in trace output ] Signed-off-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_output.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 7136420603a..a4840c260c8 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -15,6 +15,9 @@ #define EVENT_HASHSIZE 128 static DECLARE_RWSEM(trace_event_mutex); + +DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); + static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; @@ -212,6 +215,42 @@ int trace_seq_path(struct trace_seq *s, struct path *path) return 0; } +const char * +ftrace_print_flags_seq(struct trace_seq *p, const char *delim, + unsigned long flags, + const struct trace_print_flags *flag_array) +{ + unsigned long mask; + const char *str; + int i; + + trace_seq_init(p); + + for (i = 0; flag_array[i].name && flags; i++) { + + mask = flag_array[i].mask; + if ((flags & mask) != mask) + continue; + + str = flag_array[i].name; + flags &= ~mask; + if (p->len && delim) + trace_seq_puts(p, delim); + trace_seq_puts(p, str); + } + + /* check for left over flags */ + if (flags) { + if (p->len && delim) + trace_seq_puts(p, delim); + trace_seq_printf(p, "0x%lx", flags); + } + + trace_seq_putc(p, 0); + + return p->buffer; +} + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { -- cgit v1.2.3 From 0f4fc29dd68dfab9c6ddd5d087d34a5b6818cb00 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 May 2009 19:21:47 -0400 Subject: tracing: add __print_symbolic to trace events This patch adds __print_symbolic which is similar to __print_flags but works for an enumeration type instead. That is, there is only a one to one mapping between the values and the symbols. When a match is made, then it is printed, otherwise the hex value is outputed. [ Impact: add interface for showing symbol names in events ] Signed-off-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_output.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a4840c260c8..c12d95db2f5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -251,6 +251,31 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, return p->buffer; } +const char * +ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, + const struct trace_print_flags *symbol_array) +{ + int i; + + trace_seq_init(p); + + for (i = 0; symbol_array[i].name; i++) { + + if (val != symbol_array[i].mask) + continue; + + trace_seq_puts(p, symbol_array[i].name); + break; + } + + if (!p->len) + trace_seq_printf(p, "0x%lx", val); + + trace_seq_putc(p, 0); + + return p->buffer; +} + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { -- cgit v1.2.3 From ab2b7ebaad16226c9a5e85c5f384d19fa58a7459 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 26 May 2009 09:11:03 +0100 Subject: kmod: Release sub_info on cred allocation failure. call_usermodehelper_setup() forgot to kfree(sub_info) when prepare_usermodehelper_creds() failed. Signed-off-by: Tetsuo Handa Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/kmod.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index b750675251e..7e95bedb2bf 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -370,8 +370,10 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, sub_info->argv = argv; sub_info->envp = envp; sub_info->cred = prepare_usermodehelper_creds(); - if (!sub_info->cred) + if (!sub_info->cred) { + kfree(sub_info); return NULL; + } out: return sub_info; -- cgit v1.2.3 From 5b6045a906f48d37591365c5dcdd6d1d146bfd4a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 26 May 2009 17:28:02 +0200 Subject: trace: disable preemption before taking raw spinlocks s390 code uses smp_processor_id() in __raw_spin_lock() code which reveals that a (raw) spinlock is taken without preemption disabled. This can potentially deadlock. To fix this explicitly disable and enable preemption. BUG: using smp_processor_id() in preemptible [00000000] code: cat/2278 caller is trace_find_cmdline+0x40/0xfc CPU: 0 Not tainted 2.6.30-rc7-dirty #39 Process cat (pid: 2278, task: 000000003faedb68, ksp: 000000003b33b988) 000000003b33b988 000000003b33bae0 0000000000000002 0000000000000000 000000003b33bb80 000000003b33baf8 000000003b33baf8 00000000000175d6 0000000000000001 000000003b33b988 000000003f9b0000 000000000000000b 000000000000000c 000000003b33bb40 000000003b33bae0 0000000000000000 0000000000000000 00000000000175d6 000000003b33bae0 000000003b33bb28 Call Trace: ([<00000000000174b2>] show_trace+0x112/0x170) [<0000000000017582>] show_stack+0x72/0x100 [<0000000000441538>] dump_stack+0xc8/0xd8 [<000000000025c350>] debug_smp_processor_id+0x114/0x130 [<00000000000bf0e4>] trace_find_cmdline+0x40/0xfc [<00000000000c35d4>] trace_print_context+0x58/0xac [<00000000000bb676>] print_trace_line+0x416/0x470 [<00000000000bc8fe>] s_show+0x4e/0x428 [<000000000013834e>] seq_read+0x36a/0x5d4 [<0000000000112a78>] vfs_read+0xc8/0x174 [<0000000000112c58>] SyS_read+0x74/0xc4 [<000000000002c7ae>] sysc_noemu+0x10/0x16 [<000002000012436c>] 0x2000012436c 1 lock held by cat/2278: #0: (&p->lock){+.+.+.}, at: [<0000000000138056>] seq_read+0x72/0x5d4 [ Impact: fix preempt-unsafe raw spinlock ] Signed-off-by: Heiko Carstens Acked-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 02d32baa23a..a3a8a87d7e9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -808,6 +808,7 @@ void trace_find_cmdline(int pid, char comm[]) return; } + preempt_disable(); __raw_spin_lock(&trace_cmdline_lock); map = map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) @@ -816,6 +817,7 @@ void trace_find_cmdline(int pid, char comm[]) strcpy(comm, "<...>"); __raw_spin_unlock(&trace_cmdline_lock); + preempt_enable(); } void tracing_record_cmdline(struct task_struct *tsk) -- cgit v1.2.3 From c93f7669098eb97c5376e5396e3dfb734c17df4f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 28 May 2009 22:18:17 +1000 Subject: perf_counter: Fix race in attaching counters to tasks and exiting Commit 564c2b21 ("perf_counter: Optimize context switch between identical inherited contexts") introduced a race where it is possible that a counter being attached to a task could get attached to the wrong task, if the task is one that has inherited its context from another task via fork. This happens because the optimized context switch could switch the context to another task after find_get_context has read task->perf_counter_ctxp. In fact, it's possible that the context could then get freed, if the other task then exits. This fixes the problem by protecting both the context switch and the critical code in find_get_context with spinlocks. The context switch locks the cxt->lock of both the outgoing and incoming contexts before swapping them. That means that once code such as find_get_context has obtained the spinlock for the context associated with a task, the context can't get swapped to another task. However, the context may have been swapped in the interval between reading task->perf_counter_ctxp and getting the lock, so it is necessary to check and retry. To make sure that none of the contexts being looked at in find_get_context can get freed, this changes the context freeing code to use RCU. Thus an rcu_read_lock() is sufficient to ensure that no contexts can get freed. This part of the patch is lifted from a patch posted by Peter Zijlstra. This also adds a check to make sure that we can't add a counter to a task that is exiting. There is also a race between perf_counter_exit_task and find_get_context; this solves the race by moving the get_ctx that was in perf_counter_alloc into the locked region in find_get_context, so that once find_get_context has got the context for a task, it won't get freed even if the task calls perf_counter_exit_task. It doesn't matter if new top-level (non-inherited) counters get attached to the context after perf_counter_exit_task has detached the context from the task. They will just stay there and never get scheduled in until the counters' fds get closed, and then perf_release will remove them from the context and eventually free the context. With this, we are now doing the unclone in find_get_context rather than when a counter was added to or removed from a context (actually, we were missing the unclone_ctx() call when adding a counter to a context). We don't need to unclone when removing a counter from a context because we have no way to remove a counter from a cloned context. This also takes out the smp_wmb() in find_get_context, which Peter Zijlstra pointed out was unnecessary because the cmpxchg implies a full barrier anyway. Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Mike Galbraith Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <18974.33033.667187.273886@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 205 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 145 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 367299f91aa..52e5a15321d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -103,12 +103,22 @@ static void get_ctx(struct perf_counter_context *ctx) atomic_inc(&ctx->refcount); } +static void free_ctx(struct rcu_head *head) +{ + struct perf_counter_context *ctx; + + ctx = container_of(head, struct perf_counter_context, rcu_head); + kfree(ctx); +} + static void put_ctx(struct perf_counter_context *ctx) { if (atomic_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); - kfree(ctx); + if (ctx->task) + put_task_struct(ctx->task); + call_rcu(&ctx->rcu_head, free_ctx); } } @@ -211,22 +221,6 @@ group_sched_out(struct perf_counter *group_counter, cpuctx->exclusive = 0; } -/* - * Mark this context as not being a clone of another. - * Called when counters are added to or removed from this context. - * We also increment our generation number so that anything that - * was cloned from this context before this will not match anything - * cloned from this context after this. - */ -static void unclone_ctx(struct perf_counter_context *ctx) -{ - ++ctx->generation; - if (!ctx->parent_ctx) - return; - put_ctx(ctx->parent_ctx); - ctx->parent_ctx = NULL; -} - /* * Cross CPU call to remove a performance counter * @@ -281,13 +275,19 @@ static void __perf_counter_remove_from_context(void *info) * * CPU counters are removed with a smp call. For task counters we only * call when the task is on a CPU. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid. This is OK when called from perf_release since + * that only calls us on the top-level context, which can't be a clone. + * When called from perf_counter_exit_task, it's OK because the + * context has been detached from its task. */ static void perf_counter_remove_from_context(struct perf_counter *counter) { struct perf_counter_context *ctx = counter->ctx; struct task_struct *task = ctx->task; - unclone_ctx(ctx); if (!task) { /* * Per cpu counters are removed via an smp call and @@ -410,6 +410,16 @@ static void __perf_counter_disable(void *info) /* * Disable a counter. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid. This condition is satisifed when called through + * perf_counter_for_each_child or perf_counter_for_each because they + * hold the top-level counter's child_mutex, so any descendant that + * goes to exit will block in sync_child_counter. + * When called from perf_pending_counter it's OK because counter->ctx + * is the current context on this CPU and preemption is disabled, + * hence we can't get into perf_counter_task_sched_out for this context. */ static void perf_counter_disable(struct perf_counter *counter) { @@ -794,6 +804,12 @@ static void __perf_counter_enable(void *info) /* * Enable a counter. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid. This condition is satisfied when called through + * perf_counter_for_each_child or perf_counter_for_each as described + * for perf_counter_disable. */ static void perf_counter_enable(struct perf_counter *counter) { @@ -923,7 +939,9 @@ void perf_counter_task_sched_out(struct task_struct *task, struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_counter_context *ctx = task->perf_counter_ctxp; struct perf_counter_context *next_ctx; + struct perf_counter_context *parent; struct pt_regs *regs; + int do_switch = 1; regs = task_pt_regs(task); perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); @@ -932,18 +950,39 @@ void perf_counter_task_sched_out(struct task_struct *task, return; update_context_time(ctx); + + rcu_read_lock(); + parent = rcu_dereference(ctx->parent_ctx); next_ctx = next->perf_counter_ctxp; - if (next_ctx && context_equiv(ctx, next_ctx)) { - task->perf_counter_ctxp = next_ctx; - next->perf_counter_ctxp = ctx; - ctx->task = next; - next_ctx->task = task; - return; + if (parent && next_ctx && + rcu_dereference(next_ctx->parent_ctx) == parent) { + /* + * Looks like the two contexts are clones, so we might be + * able to optimize the context switch. We lock both + * contexts and check that they are clones under the + * lock (including re-checking that neither has been + * uncloned in the meantime). It doesn't matter which + * order we take the locks because no other cpu could + * be trying to lock both of these tasks. + */ + spin_lock(&ctx->lock); + spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); + if (context_equiv(ctx, next_ctx)) { + task->perf_counter_ctxp = next_ctx; + next->perf_counter_ctxp = ctx; + ctx->task = next; + next_ctx->task = task; + do_switch = 0; + } + spin_unlock(&next_ctx->lock); + spin_unlock(&ctx->lock); } + rcu_read_unlock(); - __perf_counter_sched_out(ctx, cpuctx); - - cpuctx->task_ctx = NULL; + if (do_switch) { + __perf_counter_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; + } } static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) @@ -1215,18 +1254,13 @@ __perf_counter_init_context(struct perf_counter_context *ctx, ctx->task = task; } -static void put_context(struct perf_counter_context *ctx) -{ - if (ctx->task) - put_task_struct(ctx->task); -} - static struct perf_counter_context *find_get_context(pid_t pid, int cpu) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; - struct perf_counter_context *tctx; + struct perf_counter_context *parent_ctx; struct task_struct *task; + int err; /* * If cpu is not a wildcard then this is a percpu counter: @@ -1249,6 +1283,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = &cpuctx->ctx; + get_ctx(ctx); return ctx; } @@ -1265,37 +1300,79 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) if (!task) return ERR_PTR(-ESRCH); + /* + * Can't attach counters to a dying task. + */ + err = -ESRCH; + if (task->flags & PF_EXITING) + goto errout; + /* Reuse ptrace permission checks for now. */ - if (!ptrace_may_access(task, PTRACE_MODE_READ)) { - put_task_struct(task); - return ERR_PTR(-EACCES); + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto errout; + + retry_lock: + rcu_read_lock(); + retry: + ctx = rcu_dereference(task->perf_counter_ctxp); + if (ctx) { + /* + * If this context is a clone of another, it might + * get swapped for another underneath us by + * perf_counter_task_sched_out, though the + * rcu_read_lock() protects us from any context + * getting freed. Lock the context and check if it + * got swapped before we could get the lock, and retry + * if so. If we locked the right context, then it + * can't get swapped on us any more and we can + * unclone it if necessary. + * Once it's not a clone things will be stable. + */ + spin_lock_irq(&ctx->lock); + if (ctx != rcu_dereference(task->perf_counter_ctxp)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + parent_ctx = ctx->parent_ctx; + if (parent_ctx) { + put_ctx(parent_ctx); + ctx->parent_ctx = NULL; /* no longer a clone */ + } + ++ctx->generation; + /* + * Get an extra reference before dropping the lock so that + * this context won't get freed if the task exits. + */ + get_ctx(ctx); + spin_unlock_irq(&ctx->lock); } + rcu_read_unlock(); - ctx = task->perf_counter_ctxp; if (!ctx) { ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); - if (!ctx) { - put_task_struct(task); - return ERR_PTR(-ENOMEM); - } + err = -ENOMEM; + if (!ctx) + goto errout; __perf_counter_init_context(ctx, task); - /* - * Make sure other cpus see correct values for *ctx - * once task->perf_counter_ctxp is visible to them. - */ - smp_wmb(); - tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx); - if (tctx) { + get_ctx(ctx); + if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) { /* * We raced with some other task; use * the context they set. */ kfree(ctx); - ctx = tctx; + goto retry_lock; } + get_task_struct(task); } + put_task_struct(task); return ctx; + + errout: + put_task_struct(task); + return ERR_PTR(err); } static void free_counter_rcu(struct rcu_head *head) @@ -1303,7 +1380,6 @@ static void free_counter_rcu(struct rcu_head *head) struct perf_counter *counter; counter = container_of(head, struct perf_counter, rcu_head); - put_ctx(counter->ctx); kfree(counter); } @@ -1324,6 +1400,7 @@ static void free_counter(struct perf_counter *counter) if (counter->destroy) counter->destroy(counter); + put_ctx(counter->ctx); call_rcu(&counter->rcu_head, free_counter_rcu); } @@ -1347,7 +1424,6 @@ static int perf_release(struct inode *inode, struct file *file) put_task_struct(counter->owner); free_counter(counter); - put_context(ctx); return 0; } @@ -1437,6 +1513,12 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter, mutex_unlock(&ctx->mutex); } +/* + * Holding the top-level counter's child_mutex means that any + * descendant process that has inherited this counter will block + * in sync_child_counter if it goes to exit, thus satisfying the + * task existence requirements of perf_counter_enable/disable. + */ static void perf_counter_for_each_child(struct perf_counter *counter, void (*func)(struct perf_counter *)) { @@ -3124,8 +3206,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->ctx = ctx; counter->oncpu = -1; - get_ctx(ctx); - counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) counter->state = PERF_COUNTER_STATE_OFF; @@ -3290,7 +3370,7 @@ err_free_put_context: kfree(counter); err_put_context: - put_context(ctx); + put_ctx(ctx); goto out_fput; } @@ -3322,6 +3402,7 @@ inherit_counter(struct perf_counter *parent_counter, group_leader, GFP_KERNEL); if (IS_ERR(child_counter)) return child_counter; + get_ctx(child_ctx); /* * Make the child state follow the state of the parent counter, @@ -3439,11 +3520,6 @@ __perf_counter_exit_task(struct task_struct *child, /* * When a child task exits, feed back counter values to parent counters. - * - * Note: we may be running in child context, but the PID is not hashed - * anymore so new counters will not be added. - * (XXX not sure that is true when we get called from flush_old_exec. - * -- paulus) */ void perf_counter_exit_task(struct task_struct *child) { @@ -3458,7 +3534,15 @@ void perf_counter_exit_task(struct task_struct *child) local_irq_save(flags); __perf_counter_task_sched_out(child_ctx); + + /* + * Take the context lock here so that if find_get_context is + * reading child->perf_counter_ctxp, we wait until it has + * incremented the context's refcount before we do put_ctx below. + */ + spin_lock(&child_ctx->lock); child->perf_counter_ctxp = NULL; + spin_unlock(&child_ctx->lock); local_irq_restore(flags); mutex_lock(&child_ctx->mutex); @@ -3513,6 +3597,7 @@ int perf_counter_init_task(struct task_struct *child) __perf_counter_init_context(child_ctx, child); child->perf_counter_ctxp = child_ctx; + get_task_struct(child); /* * Lock the parent list. No need to lock the child - not PID -- cgit v1.2.3 From a9862e0560866eadbc59b84867492004da436516 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 19 May 2009 22:49:07 +0200 Subject: Export add_timer_on for modules Needed in followon patch. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin Signed-off-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- kernel/timer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index cffffad01c3..e2c47b82ac3 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -756,6 +756,7 @@ void add_timer_on(struct timer_list *timer, int cpu) wake_up_idle_cpu(cpu); spin_unlock_irqrestore(&base->lock, flags); } +EXPORT_SYMBOL_GPL(add_timer_on); /** * del_timer - deactive a timer. -- cgit v1.2.3 From ad3a37de81c45f6c20d410ece86004b98f7b6d84 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 29 May 2009 16:06:20 +1000 Subject: perf_counter: Don't swap contexts containing locked mutex Peter Zijlstra pointed out that under some circumstances, we can take the mutex in a context or a counter and then swap that context or counter to another task, potentially leading to lock order inversions or the mutexes not protecting what they are supposed to protect. This fixes the problem by making sure that we never take a mutex in a context or counter which could get swapped to another task. Most of the cases where we take a mutex is on a top-level counter or context, i.e. a counter which has an fd associated with it or a context that contains such a counter. This adds WARN_ON_ONCE statements to verify that. The two cases where we need to take the mutex on a context that is a clone of another are in perf_counter_exit_task and perf_counter_init_task. The perf_counter_exit_task case is solved by uncloning the context before starting to remove the counters from it. The perf_counter_init_task is a little trickier; we temporarily disable context swapping for the parent (forking) task by setting its ctx->parent_gen to the all-1s value after locking the context, if it is a cloned context, and restore the ctx->parent_gen value at the end if the context didn't get uncloned in the meantime. This also moves the increment of the context generation count to be within the same critical section, protected by the context mutex, that adds the new counter to the context. That way, taking the mutex is sufficient to ensure that both the counter list and the generation count are stable. [ Impact: fix hangs, races with inherited and PID counters ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <18975.31580.520676.619896@drongo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 89 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 79 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 52e5a15321d..db843f812a6 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -919,7 +919,8 @@ static int context_equiv(struct perf_counter_context *ctx1, struct perf_counter_context *ctx2) { return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx - && ctx1->parent_gen == ctx2->parent_gen; + && ctx1->parent_gen == ctx2->parent_gen + && ctx1->parent_gen != ~0ull; } /* @@ -1339,7 +1340,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) put_ctx(parent_ctx); ctx->parent_ctx = NULL; /* no longer a clone */ } - ++ctx->generation; /* * Get an extra reference before dropping the lock so that * this context won't get freed if the task exits. @@ -1414,6 +1414,7 @@ static int perf_release(struct inode *inode, struct file *file) file->private_data = NULL; + WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_counter_remove_from_context(counter); mutex_unlock(&ctx->mutex); @@ -1445,6 +1446,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (counter->state == PERF_COUNTER_STATE_ERROR) return 0; + WARN_ON_ONCE(counter->ctx->parent_ctx); mutex_lock(&counter->child_mutex); values[0] = perf_counter_read(counter); n = 1; @@ -1504,6 +1506,7 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter, struct perf_counter_context *ctx = counter->ctx; struct perf_counter *sibling; + WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); counter = counter->group_leader; @@ -1524,6 +1527,7 @@ static void perf_counter_for_each_child(struct perf_counter *counter, { struct perf_counter *child; + WARN_ON_ONCE(counter->ctx->parent_ctx); mutex_lock(&counter->child_mutex); func(counter); list_for_each_entry(child, &counter->child_list, child_list) @@ -1536,6 +1540,7 @@ static void perf_counter_for_each(struct perf_counter *counter, { struct perf_counter *child; + WARN_ON_ONCE(counter->ctx->parent_ctx); mutex_lock(&counter->child_mutex); perf_counter_for_each_sibling(counter, func); list_for_each_entry(child, &counter->child_list, child_list) @@ -1741,6 +1746,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_counter *counter = vma->vm_file->private_data; + WARN_ON_ONCE(counter->ctx->parent_ctx); if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { struct user_struct *user = current_user(); @@ -1788,6 +1794,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_pgoff != 0) return -EINVAL; + WARN_ON_ONCE(counter->ctx->parent_ctx); mutex_lock(&counter->mmap_mutex); if (atomic_inc_not_zero(&counter->mmap_count)) { if (nr_pages != counter->data->nr_pages) @@ -3349,8 +3356,10 @@ SYSCALL_DEFINE5(perf_counter_open, goto err_free_put_context; counter->filp = counter_file; + WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_install_in_context(ctx, counter, cpu); + ++ctx->generation; mutex_unlock(&ctx->mutex); counter->owner = current; @@ -3436,6 +3445,7 @@ inherit_counter(struct perf_counter *parent_counter, /* * Link this into the parent counter's child list */ + WARN_ON_ONCE(parent_counter->ctx->parent_ctx); mutex_lock(&parent_counter->child_mutex); list_add_tail(&child_counter->child_list, &parent_counter->child_list); mutex_unlock(&parent_counter->child_mutex); @@ -3485,6 +3495,7 @@ static void sync_child_counter(struct perf_counter *child_counter, /* * Remove this counter from the parent's list */ + WARN_ON_ONCE(parent_counter->ctx->parent_ctx); mutex_lock(&parent_counter->child_mutex); list_del_init(&child_counter->child_list); mutex_unlock(&parent_counter->child_mutex); @@ -3527,12 +3538,17 @@ void perf_counter_exit_task(struct task_struct *child) struct perf_counter_context *child_ctx; unsigned long flags; - child_ctx = child->perf_counter_ctxp; - - if (likely(!child_ctx)) + if (likely(!child->perf_counter_ctxp)) return; local_irq_save(flags); + /* + * We can't reschedule here because interrupts are disabled, + * and either child is current or it is a task that can't be + * scheduled, so we are now safe from rescheduling changing + * our context. + */ + child_ctx = child->perf_counter_ctxp; __perf_counter_task_sched_out(child_ctx); /* @@ -3542,6 +3558,15 @@ void perf_counter_exit_task(struct task_struct *child) */ spin_lock(&child_ctx->lock); child->perf_counter_ctxp = NULL; + if (child_ctx->parent_ctx) { + /* + * This context is a clone; unclone it so it can't get + * swapped to another process while we're removing all + * the counters from it. + */ + put_ctx(child_ctx->parent_ctx); + child_ctx->parent_ctx = NULL; + } spin_unlock(&child_ctx->lock); local_irq_restore(flags); @@ -3571,9 +3596,11 @@ again: int perf_counter_init_task(struct task_struct *child) { struct perf_counter_context *child_ctx, *parent_ctx; + struct perf_counter_context *cloned_ctx; struct perf_counter *counter; struct task_struct *parent = current; int inherited_all = 1; + u64 cloned_gen; int ret = 0; child->perf_counter_ctxp = NULL; @@ -3581,8 +3608,7 @@ int perf_counter_init_task(struct task_struct *child) mutex_init(&child->perf_counter_mutex); INIT_LIST_HEAD(&child->perf_counter_list); - parent_ctx = parent->perf_counter_ctxp; - if (likely(!parent_ctx || !parent_ctx->nr_counters)) + if (likely(!parent->perf_counter_ctxp)) return 0; /* @@ -3599,6 +3625,34 @@ int perf_counter_init_task(struct task_struct *child) child->perf_counter_ctxp = child_ctx; get_task_struct(child); + /* + * If the parent's context is a clone, temporarily set its + * parent_gen to an impossible value (all 1s) so it won't get + * swapped under us. The rcu_read_lock makes sure that + * parent_ctx continues to exist even if it gets swapped to + * another process and then freed while we are trying to get + * its lock. + */ + rcu_read_lock(); + retry: + parent_ctx = rcu_dereference(parent->perf_counter_ctxp); + /* + * No need to check if parent_ctx != NULL here; since we saw + * it non-NULL earlier, the only reason for it to become NULL + * is if we exit, and since we're currently in the middle of + * a fork we can't be exiting at the same time. + */ + spin_lock_irq(&parent_ctx->lock); + if (parent_ctx != rcu_dereference(parent->perf_counter_ctxp)) { + spin_unlock_irq(&parent_ctx->lock); + goto retry; + } + cloned_gen = parent_ctx->parent_gen; + if (parent_ctx->parent_ctx) + parent_ctx->parent_gen = ~0ull; + spin_unlock_irq(&parent_ctx->lock); + rcu_read_unlock(); + /* * Lock the parent list. No need to lock the child - not PID * hashed yet and not running, so nobody can access it. @@ -3630,10 +3684,15 @@ int perf_counter_init_task(struct task_struct *child) /* * Mark the child context as a clone of the parent * context, or of whatever the parent is a clone of. + * Note that if the parent is a clone, it could get + * uncloned at any point, but that doesn't matter + * because the list of counters and the generation + * count can't have changed since we took the mutex. */ - if (parent_ctx->parent_ctx) { - child_ctx->parent_ctx = parent_ctx->parent_ctx; - child_ctx->parent_gen = parent_ctx->parent_gen; + cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); + if (cloned_ctx) { + child_ctx->parent_ctx = cloned_ctx; + child_ctx->parent_gen = cloned_gen; } else { child_ctx->parent_ctx = parent_ctx; child_ctx->parent_gen = parent_ctx->generation; @@ -3643,6 +3702,16 @@ int perf_counter_init_task(struct task_struct *child) mutex_unlock(&parent_ctx->mutex); + /* + * Restore the clone status of the parent. + */ + if (parent_ctx->parent_ctx) { + spin_lock_irq(&parent_ctx->lock); + if (parent_ctx->parent_ctx) + parent_ctx->parent_gen = cloned_gen; + spin_unlock_irq(&parent_ctx->lock); + } + return ret; } -- cgit v1.2.3 From 3f4dee227348daac32f36daad9a91059efd0723e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 May 2009 11:25:09 +0200 Subject: perf_counter: Fix cpuctx->task_ctx races Peter noticed that we are sometimes reading cpuctx->task_ctx with interrupts enabled. Noticed-by: Peter Zijlstra Acked-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index db843f812a6..eb346048f00 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -234,15 +234,18 @@ static void __perf_counter_remove_from_context(void *info) struct perf_counter_context *ctx = counter->ctx; unsigned long flags; + local_irq_save(flags); /* * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. */ - if (ctx->task && cpuctx->task_ctx != ctx) + if (ctx->task && cpuctx->task_ctx != ctx) { + local_irq_restore(flags); return; + } - spin_lock_irqsave(&ctx->lock, flags); + spin_lock(&ctx->lock); /* * Protect the list operation against NMI by disabling the * counters on a global level. @@ -382,14 +385,17 @@ static void __perf_counter_disable(void *info) struct perf_counter_context *ctx = counter->ctx; unsigned long flags; + local_irq_save(flags); /* * If this is a per-task counter, need to check whether this * counter's task is the current task on this cpu. */ - if (ctx->task && cpuctx->task_ctx != ctx) + if (ctx->task && cpuctx->task_ctx != ctx) { + local_irq_restore(flags); return; + } - spin_lock_irqsave(&ctx->lock, flags); + spin_lock(&ctx->lock); /* * If the counter is on, turn it off. @@ -615,6 +621,7 @@ static void __perf_install_in_context(void *info) unsigned long flags; int err; + local_irq_save(flags); /* * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been @@ -623,12 +630,14 @@ static void __perf_install_in_context(void *info) * on this cpu because it had no counters. */ if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) + if (cpuctx->task_ctx || ctx->task != current) { + local_irq_restore(flags); return; + } cpuctx->task_ctx = ctx; } - spin_lock_irqsave(&ctx->lock, flags); + spin_lock(&ctx->lock); ctx->is_active = 1; update_context_time(ctx); @@ -745,17 +754,20 @@ static void __perf_counter_enable(void *info) unsigned long flags; int err; + local_irq_save(flags); /* * If this is a per-task counter, need to check whether this * counter's task is the current task on this cpu. */ if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) + if (cpuctx->task_ctx || ctx->task != current) { + local_irq_restore(flags); return; + } cpuctx->task_ctx = ctx; } - spin_lock_irqsave(&ctx->lock, flags); + spin_lock(&ctx->lock); ctx->is_active = 1; update_context_time(ctx); -- cgit v1.2.3 From 012b84dae17126d8b5d159173091eb3db5a2bc43 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 11:08:41 +0200 Subject: perf_counter: Robustify counter-free logic This fixes a nasty crash and highlights a bug that we were freeing failed-fork() counters incorrectly. (the fix for that will come separately) [ Impact: fix crashes/lockups with inherited counters ] Acked-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index eb346048f00..616c52426b3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1004,6 +1004,10 @@ static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) if (!cpuctx->task_ctx) return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; } -- cgit v1.2.3 From efb3d17240d80e27508d238809168120fe4b93a4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2009 14:25:58 +0200 Subject: perf_counter: Fix COMM and MMAP events for cpu wide counters Commit a63eaf34ae6 ("perf_counter: Dynamically allocate tasks' perf_counter_context struct") broke COMM and MMAP notification for cpu wide counters by dropping out early if there was no task context, thereby also not iterating the cpu context. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 616c52426b3..58d6d198faa 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2443,9 +2443,9 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) cpuctx = &get_cpu_var(perf_cpu_context); perf_counter_comm_ctx(&cpuctx->ctx, comm_event); + if (cpuctx->task_ctx) + perf_counter_comm_ctx(cpuctx->task_ctx, comm_event); put_cpu_var(perf_cpu_context); - - perf_counter_comm_ctx(current->perf_counter_ctxp, comm_event); } void perf_counter_comm(struct task_struct *task) @@ -2454,8 +2454,6 @@ void perf_counter_comm(struct task_struct *task) if (!atomic_read(&nr_comm_tracking)) return; - if (!current->perf_counter_ctxp) - return; comm_event = (struct perf_comm_event){ .task = task, @@ -2570,10 +2568,10 @@ got_name: cpuctx = &get_cpu_var(perf_cpu_context); perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); + if (cpuctx->task_ctx) + perf_counter_mmap_ctx(cpuctx->task_ctx, mmap_event); put_cpu_var(perf_cpu_context); - perf_counter_mmap_ctx(current->perf_counter_ctxp, mmap_event); - kfree(buf); } @@ -2584,8 +2582,6 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, if (!atomic_read(&nr_mmap_tracking)) return; - if (!current->perf_counter_ctxp) - return; mmap_event = (struct perf_mmap_event){ .file = file, -- cgit v1.2.3 From 665c2142a94202881a3c11cbaee6506cb10ada2d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2009 14:51:57 +0200 Subject: perf_counter: Clean up task_ctx vs interrupts Remove the local_irq_save() etc.. in routines that are smp function calls, or have IRQs disabled by other means. Then change the COMM, MMAP, and swcounter context iteration to current->perf_counter_ctxp and RCU, since it really doesn't matter which context they iterate, they're all folded. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 82 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 58d6d198faa..0c000d305e0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -232,18 +232,14 @@ static void __perf_counter_remove_from_context(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; - unsigned long flags; - local_irq_save(flags); /* * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. */ - if (ctx->task && cpuctx->task_ctx != ctx) { - local_irq_restore(flags); + if (ctx->task && cpuctx->task_ctx != ctx) return; - } spin_lock(&ctx->lock); /* @@ -267,7 +263,7 @@ static void __perf_counter_remove_from_context(void *info) } perf_enable(); - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); } @@ -383,17 +379,13 @@ static void __perf_counter_disable(void *info) struct perf_counter *counter = info; struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter_context *ctx = counter->ctx; - unsigned long flags; - local_irq_save(flags); /* * If this is a per-task counter, need to check whether this * counter's task is the current task on this cpu. */ - if (ctx->task && cpuctx->task_ctx != ctx) { - local_irq_restore(flags); + if (ctx->task && cpuctx->task_ctx != ctx) return; - } spin_lock(&ctx->lock); @@ -411,7 +403,7 @@ static void __perf_counter_disable(void *info) counter->state = PERF_COUNTER_STATE_OFF; } - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); } /* @@ -618,10 +610,8 @@ static void __perf_install_in_context(void *info) struct perf_counter_context *ctx = counter->ctx; struct perf_counter *leader = counter->group_leader; int cpu = smp_processor_id(); - unsigned long flags; int err; - local_irq_save(flags); /* * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been @@ -630,10 +620,8 @@ static void __perf_install_in_context(void *info) * on this cpu because it had no counters. */ if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) { - local_irq_restore(flags); + if (cpuctx->task_ctx || ctx->task != current) return; - } cpuctx->task_ctx = ctx; } @@ -687,7 +675,7 @@ static void __perf_install_in_context(void *info) unlock: perf_enable(); - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); } /* @@ -751,19 +739,15 @@ static void __perf_counter_enable(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter_context *ctx = counter->ctx; struct perf_counter *leader = counter->group_leader; - unsigned long flags; int err; - local_irq_save(flags); /* * If this is a per-task counter, need to check whether this * counter's task is the current task on this cpu. */ if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) { - local_irq_restore(flags); + if (cpuctx->task_ctx || ctx->task != current) return; - } cpuctx->task_ctx = ctx; } @@ -811,7 +795,7 @@ static void __perf_counter_enable(void *info) } unlock: - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); } /* @@ -981,6 +965,10 @@ void perf_counter_task_sched_out(struct task_struct *task, spin_lock(&ctx->lock); spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { + /* + * XXX do we need a memory barrier of sorts + * wrt to rcu_dereference() of perf_counter_ctxp + */ task->perf_counter_ctxp = next_ctx; next->perf_counter_ctxp = ctx; ctx->task = next; @@ -998,6 +986,9 @@ void perf_counter_task_sched_out(struct task_struct *task, } } +/* + * Called with IRQs disabled + */ static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); @@ -1012,6 +1003,9 @@ static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) cpuctx->task_ctx = NULL; } +/* + * Called with IRQs disabled + */ static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) { __perf_counter_sched_out(&cpuctx->ctx, cpuctx); @@ -2431,6 +2425,7 @@ static void perf_counter_comm_ctx(struct perf_counter_context *ctx, static void perf_counter_comm_event(struct perf_comm_event *comm_event) { struct perf_cpu_context *cpuctx; + struct perf_counter_context *ctx; unsigned int size; char *comm = comm_event->task->comm; @@ -2443,9 +2438,17 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) cpuctx = &get_cpu_var(perf_cpu_context); perf_counter_comm_ctx(&cpuctx->ctx, comm_event); - if (cpuctx->task_ctx) - perf_counter_comm_ctx(cpuctx->task_ctx, comm_event); put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_counter_ctxp); + if (ctx) + perf_counter_comm_ctx(ctx, comm_event); + rcu_read_unlock(); } void perf_counter_comm(struct task_struct *task) @@ -2536,6 +2539,7 @@ static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) { struct perf_cpu_context *cpuctx; + struct perf_counter_context *ctx; struct file *file = mmap_event->file; unsigned int size; char tmp[16]; @@ -2568,10 +2572,18 @@ got_name: cpuctx = &get_cpu_var(perf_cpu_context); perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); - if (cpuctx->task_ctx) - perf_counter_mmap_ctx(cpuctx->task_ctx, mmap_event); put_cpu_var(perf_cpu_context); + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_counter_ctxp); + if (ctx) + perf_counter_mmap_ctx(ctx, mmap_event); + rcu_read_unlock(); + kfree(buf); } @@ -2882,6 +2894,7 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event, { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); int *recursion = perf_swcounter_recursion_context(cpuctx); + struct perf_counter_context *ctx; if (*recursion) goto out; @@ -2891,10 +2904,15 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event, perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs, addr); - if (cpuctx->task_ctx) { - perf_swcounter_ctx_event(cpuctx->task_ctx, type, event, - nr, nmi, regs, addr); - } + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_counter_ctxp); + if (ctx) + perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr); + rcu_read_unlock(); barrier(); (*recursion)--; -- cgit v1.2.3 From bbbee90829304d156c12b171c0ac7e6e1aba8b90 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2009 14:25:58 +0200 Subject: perf_counter: Ammend cleanup in fork() fail When fork() fails we cannot use perf_counter_exit_task() since that assumes to operate on current. Write a new helper that cleans up unused/clean contexts. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 +- kernel/perf_counter.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 41 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index c07c3335cea..23bf757ed32 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1298,7 +1298,7 @@ bad_fork_cleanup_semundo: bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: - perf_counter_exit_task(p); + perf_counter_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0c000d305e0..79c3f26541d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3538,8 +3538,7 @@ static void sync_child_counter(struct perf_counter *child_counter, } static void -__perf_counter_exit_task(struct task_struct *child, - struct perf_counter *child_counter, +__perf_counter_exit_task(struct perf_counter *child_counter, struct perf_counter_context *child_ctx) { struct perf_counter *parent_counter; @@ -3605,7 +3604,7 @@ void perf_counter_exit_task(struct task_struct *child) again: list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, list_entry) - __perf_counter_exit_task(child, child_counter, child_ctx); + __perf_counter_exit_task(child_counter, child_ctx); /* * If the last counter was a group counter, it will have appended all @@ -3620,6 +3619,44 @@ again: put_ctx(child_ctx); } +/* + * free an unexposed, unused context as created by inheritance by + * init_task below, used by fork() in case of fail. + */ +void perf_counter_free_task(struct task_struct *task) +{ + struct perf_counter_context *ctx = task->perf_counter_ctxp; + struct perf_counter *counter, *tmp; + + if (!ctx) + return; + + mutex_lock(&ctx->mutex); +again: + list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) { + struct perf_counter *parent = counter->parent; + + if (WARN_ON_ONCE(!parent)) + continue; + + mutex_lock(&parent->child_mutex); + list_del_init(&counter->child_list); + mutex_unlock(&parent->child_mutex); + + fput(parent->filp); + + list_del_counter(counter, ctx); + free_counter(counter); + } + + if (!list_empty(&ctx->counter_list)) + goto again; + + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); +} + /* * Initialize the perf_counter context in task_struct */ -- cgit v1.2.3 From 25346b93ca079080c9cb23331db5c4f6404e8530 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 1 Jun 2009 17:48:12 +1000 Subject: perf_counter: Provide functions for locking and pinning the context for a task This abstracts out the code for locking the context associated with a task. Because the context might get transferred from one task to another concurrently, we have to check after locking the context that it is still the right context for the task and retry if not. This was open-coded in find_get_context() and perf_counter_init_task(). This adds a further function for pinning the context for a task, i.e. marking it so it can't be transferred to another task. This adds a 'pin_count' field to struct perf_counter_context to indicate that a context is pinned, instead of the previous method of setting the parent_gen count to all 1s. Pinning the context with a pin_count is easier to undo and doesn't require saving the parent_gen value. This also adds a perf_unpin_context() to undo the effect of perf_pin_task_context() and changes perf_counter_init_task to use it. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <18979.34748.755674.596386@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 128 +++++++++++++++++++++++++++++--------------------- 1 file changed, 74 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 79c3f26541d..da8dfef4b47 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -122,6 +122,69 @@ static void put_ctx(struct perf_counter_context *ctx) } } +/* + * Get the perf_counter_context for a task and lock it. + * This has to cope with with the fact that until it is locked, + * the context could get moved to another task. + */ +static struct perf_counter_context *perf_lock_task_context( + struct task_struct *task, unsigned long *flags) +{ + struct perf_counter_context *ctx; + + rcu_read_lock(); + retry: + ctx = rcu_dereference(task->perf_counter_ctxp); + if (ctx) { + /* + * If this context is a clone of another, it might + * get swapped for another underneath us by + * perf_counter_task_sched_out, though the + * rcu_read_lock() protects us from any context + * getting freed. Lock the context and check if it + * got swapped before we could get the lock, and retry + * if so. If we locked the right context, then it + * can't get swapped on us any more. + */ + spin_lock_irqsave(&ctx->lock, *flags); + if (ctx != rcu_dereference(task->perf_counter_ctxp)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + goto retry; + } + } + rcu_read_unlock(); + return ctx; +} + +/* + * Get the context for a task and increment its pin_count so it + * can't get swapped to another task. This also increments its + * reference count so that the context can't get freed. + */ +static struct perf_counter_context *perf_pin_task_context(struct task_struct *task) +{ + struct perf_counter_context *ctx; + unsigned long flags; + + ctx = perf_lock_task_context(task, &flags); + if (ctx) { + ++ctx->pin_count; + get_ctx(ctx); + spin_unlock_irqrestore(&ctx->lock, flags); + } + return ctx; +} + +static void perf_unpin_context(struct perf_counter_context *ctx) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->lock, flags); + --ctx->pin_count; + spin_unlock_irqrestore(&ctx->lock, flags); + put_ctx(ctx); +} + /* * Add a counter from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -916,7 +979,7 @@ static int context_equiv(struct perf_counter_context *ctx1, { return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && ctx1->parent_gen == ctx2->parent_gen - && ctx1->parent_gen != ~0ull; + && !ctx1->pin_count && !ctx2->pin_count; } /* @@ -1271,6 +1334,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) struct perf_counter_context *ctx; struct perf_counter_context *parent_ctx; struct task_struct *task; + unsigned long flags; int err; /* @@ -1323,28 +1387,9 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto errout; - retry_lock: - rcu_read_lock(); retry: - ctx = rcu_dereference(task->perf_counter_ctxp); + ctx = perf_lock_task_context(task, &flags); if (ctx) { - /* - * If this context is a clone of another, it might - * get swapped for another underneath us by - * perf_counter_task_sched_out, though the - * rcu_read_lock() protects us from any context - * getting freed. Lock the context and check if it - * got swapped before we could get the lock, and retry - * if so. If we locked the right context, then it - * can't get swapped on us any more and we can - * unclone it if necessary. - * Once it's not a clone things will be stable. - */ - spin_lock_irq(&ctx->lock); - if (ctx != rcu_dereference(task->perf_counter_ctxp)) { - spin_unlock_irq(&ctx->lock); - goto retry; - } parent_ctx = ctx->parent_ctx; if (parent_ctx) { put_ctx(parent_ctx); @@ -1355,9 +1400,8 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) * this context won't get freed if the task exits. */ get_ctx(ctx); - spin_unlock_irq(&ctx->lock); + spin_unlock_irqrestore(&ctx->lock, flags); } - rcu_read_unlock(); if (!ctx) { ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); @@ -1372,7 +1416,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) * the context they set. */ kfree(ctx); - goto retry_lock; + goto retry; } get_task_struct(task); } @@ -3667,7 +3711,6 @@ int perf_counter_init_task(struct task_struct *child) struct perf_counter *counter; struct task_struct *parent = current; int inherited_all = 1; - u64 cloned_gen; int ret = 0; child->perf_counter_ctxp = NULL; @@ -3693,32 +3736,17 @@ int perf_counter_init_task(struct task_struct *child) get_task_struct(child); /* - * If the parent's context is a clone, temporarily set its - * parent_gen to an impossible value (all 1s) so it won't get - * swapped under us. The rcu_read_lock makes sure that - * parent_ctx continues to exist even if it gets swapped to - * another process and then freed while we are trying to get - * its lock. + * If the parent's context is a clone, pin it so it won't get + * swapped under us. */ - rcu_read_lock(); - retry: - parent_ctx = rcu_dereference(parent->perf_counter_ctxp); + parent_ctx = perf_pin_task_context(parent); + /* * No need to check if parent_ctx != NULL here; since we saw * it non-NULL earlier, the only reason for it to become NULL * is if we exit, and since we're currently in the middle of * a fork we can't be exiting at the same time. */ - spin_lock_irq(&parent_ctx->lock); - if (parent_ctx != rcu_dereference(parent->perf_counter_ctxp)) { - spin_unlock_irq(&parent_ctx->lock); - goto retry; - } - cloned_gen = parent_ctx->parent_gen; - if (parent_ctx->parent_ctx) - parent_ctx->parent_gen = ~0ull; - spin_unlock_irq(&parent_ctx->lock); - rcu_read_unlock(); /* * Lock the parent list. No need to lock the child - not PID @@ -3759,7 +3787,7 @@ int perf_counter_init_task(struct task_struct *child) cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); if (cloned_ctx) { child_ctx->parent_ctx = cloned_ctx; - child_ctx->parent_gen = cloned_gen; + child_ctx->parent_gen = parent_ctx->parent_gen; } else { child_ctx->parent_ctx = parent_ctx; child_ctx->parent_gen = parent_ctx->generation; @@ -3769,15 +3797,7 @@ int perf_counter_init_task(struct task_struct *child) mutex_unlock(&parent_ctx->mutex); - /* - * Restore the clone status of the parent. - */ - if (parent_ctx->parent_ctx) { - spin_lock_irq(&parent_ctx->lock); - if (parent_ctx->parent_ctx) - parent_ctx->parent_gen = cloned_gen; - spin_unlock_irq(&parent_ctx->lock); - } + perf_unpin_context(parent_ctx); return ret; } -- cgit v1.2.3 From 880ca15adf2392770a68047e7a98e076ff4d21da Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 1 Jun 2009 17:49:14 +1000 Subject: perf_counter: Allow software counters to count while task is not running This changes perf_swcounter_match() so that per-task software counters can count events that occur while their associated task is not running. This will allow us to use the generic software counter code for counting task migrations, which can occur while the task is not scheduled in. To do this, we have to distinguish between the situations where the counter is inactive because its task has been scheduled out, and those where the counter is inactive because it is part of a group that was not able to go on the PMU. In the former case we want the counter to count, but not in the latter case. If the context is active, we have the latter case. If the context is inactive then we need to know whether the counter was counting when the context was last active, which we can determine by comparing its ->tstamp_stopped timestamp with the context's timestamp. This also folds three checks in perf_swcounter_match, checking perf_event_raw(), perf_event_type() and perf_event_id() individually, into a single 64-bit comparison on counter->hw_event.config, as an optimization. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <18979.34810.259718.955621@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 48 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index da8dfef4b47..ff8b4636f84 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2867,20 +2867,56 @@ static void perf_swcounter_overflow(struct perf_counter *counter, } +static int perf_swcounter_is_counting(struct perf_counter *counter) +{ + struct perf_counter_context *ctx; + unsigned long flags; + int count; + + if (counter->state == PERF_COUNTER_STATE_ACTIVE) + return 1; + + if (counter->state != PERF_COUNTER_STATE_INACTIVE) + return 0; + + /* + * If the counter is inactive, it could be just because + * its task is scheduled out, or because it's in a group + * which could not go on the PMU. We want to count in + * the first case but not the second. If the context is + * currently active then an inactive software counter must + * be the second case. If it's not currently active then + * we need to know whether the counter was active when the + * context was last active, which we can determine by + * comparing counter->tstamp_stopped with ctx->time. + * + * We are within an RCU read-side critical section, + * which protects the existence of *ctx. + */ + ctx = counter->ctx; + spin_lock_irqsave(&ctx->lock, flags); + count = 1; + /* Re-check state now we have the lock */ + if (counter->state < PERF_COUNTER_STATE_INACTIVE || + counter->ctx->is_active || + counter->tstamp_stopped < ctx->time) + count = 0; + spin_unlock_irqrestore(&ctx->lock, flags); + return count; +} + static int perf_swcounter_match(struct perf_counter *counter, enum perf_event_types type, u32 event, struct pt_regs *regs) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - return 0; + u64 event_config; - if (perf_event_raw(&counter->hw_event)) - return 0; + event_config = ((u64) type << PERF_COUNTER_TYPE_SHIFT) | event; - if (perf_event_type(&counter->hw_event) != type) + if (!perf_swcounter_is_counting(counter)) return 0; - if (perf_event_id(&counter->hw_event) != event) + if (counter->hw_event.config != event_config) return 0; if (counter->hw_event.exclude_user && user_mode(regs)) -- cgit v1.2.3 From 22a4f650d686eeaac3629dae1c4294381485efdf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 1 Jun 2009 10:13:37 +0200 Subject: perf_counter: Tidy up style details - whitespace fixlets - make local variable definitions more consistent [ Impact: cleanup ] Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ff8b4636f84..df319c48c52 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -16,8 +16,9 @@ #include #include #include -#include +#include #include +#include #include #include #include @@ -26,7 +27,6 @@ #include #include #include -#include #include @@ -65,7 +65,9 @@ void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } void __weak hw_perf_counter_setup(int cpu) { barrier(); } -int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, + +int __weak +hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu) { @@ -127,8 +129,8 @@ static void put_ctx(struct perf_counter_context *ctx) * This has to cope with with the fact that until it is locked, * the context could get moved to another task. */ -static struct perf_counter_context *perf_lock_task_context( - struct task_struct *task, unsigned long *flags) +static struct perf_counter_context * +perf_lock_task_context(struct task_struct *task, unsigned long *flags) { struct perf_counter_context *ctx; @@ -1330,9 +1332,9 @@ __perf_counter_init_context(struct perf_counter_context *ctx, static struct perf_counter_context *find_get_context(pid_t pid, int cpu) { - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; struct perf_counter_context *parent_ctx; + struct perf_counter_context *ctx; + struct perf_cpu_context *cpuctx; struct task_struct *task; unsigned long flags; int err; @@ -1664,8 +1666,8 @@ int perf_counter_task_disable(void) */ void perf_counter_update_userpage(struct perf_counter *counter) { - struct perf_mmap_data *data; struct perf_counter_mmap_page *userpg; + struct perf_mmap_data *data; rcu_read_lock(); data = rcu_dereference(counter->data); @@ -1769,10 +1771,11 @@ fail: static void __perf_mmap_data_free(struct rcu_head *rcu_head) { - struct perf_mmap_data *data = container_of(rcu_head, - struct perf_mmap_data, rcu_head); + struct perf_mmap_data *data; int i; + data = container_of(rcu_head, struct perf_mmap_data, rcu_head); + free_page((unsigned long)data->user_page); for (i = 0; i < data->nr_pages; i++) free_page((unsigned long)data->data_pages[i]); @@ -1801,8 +1804,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct perf_counter *counter = vma->vm_file->private_data; WARN_ON_ONCE(counter->ctx->parent_ctx); - if (atomic_dec_and_mutex_lock(&counter->mmap_count, - &counter->mmap_mutex)) { + if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { struct user_struct *user = current_user(); atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); @@ -1821,11 +1823,11 @@ static struct vm_operations_struct perf_mmap_vmops = { static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_counter *counter = file->private_data; + unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); + unsigned long locked, lock_limit; unsigned long vma_size; unsigned long nr_pages; - unsigned long user_locked, user_lock_limit; - unsigned long locked, lock_limit; long user_extra, extra; int ret = 0; @@ -1900,8 +1902,8 @@ unlock: static int perf_fasync(int fd, struct file *filp, int on) { - struct perf_counter *counter = filp->private_data; struct inode *inode = filp->f_path.dentry->d_inode; + struct perf_counter *counter = filp->private_data; int retval; mutex_lock(&inode->i_mutex); @@ -2412,8 +2414,8 @@ static void perf_counter_output(struct perf_counter *counter, */ struct perf_comm_event { - struct task_struct *task; - char *comm; + struct task_struct *task; + char *comm; int comm_size; struct { @@ -2932,6 +2934,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { int neg = atomic64_add_negative(nr, &counter->hw.count); + if (counter->hw.irq_period && !neg) perf_swcounter_overflow(counter, nmi, regs, addr); } @@ -3526,7 +3529,7 @@ inherit_counter(struct perf_counter *parent_counter, /* * Make the child state follow the state of the parent counter, * not its hw_event.disabled bit. We hold the parent's mutex, - * so we won't race with perf_counter_{en,dis}able_family. + * so we won't race with perf_counter_{en, dis}able_family. */ if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) child_counter->state = PERF_COUNTER_STATE_INACTIVE; -- cgit v1.2.3 From fb39125fd79a25c5002f3b45cf4c80e3fa6b961b Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 17 Apr 2009 15:15:51 +0800 Subject: ftrace, workqueuetrace: make workqueue tracepoints use TRACE_EVENT macro v3: zhaolei@cn.fujitsu.com: Change TRACE_EVENT definition to new format introduced by Steven Rostedt: consolidate trace and trace_event headers v2: kosaki@jp.fujitsu.com: print the function names instead of addr, and zap the work addr v1: zhaolei@cn.fujitsu.com: Make workqueue tracepoints use TRACE_EVENT macro TRACE_EVENT is a more generic way to define tracepoints. Doing so adds these new capabilities to the tracepoints: - zero-copy and per-cpu splice() tracing - binary tracing without printf overhead - structured logging records exposed under /debug/tracing/events - trace events embedded in function tracer output and other plugins - user-defined, per tracepoint filter expressions Then, this patch converts DEFINE_TRACE to TRACE_EVENT in workqueue related tracepoints. [ Impact: expand workqueue tracer to events tracing ] Signed-off-by: Zhao Lei Cc: Steven Rostedt Cc: Tom Zanussi Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: KOSAKI Motohiro Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_workqueue.c | 2 +- kernel/workqueue.c | 11 ++--------- 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 984b9175c13..cfe56d31d85 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -6,7 +6,7 @@ */ -#include +#include #include #include #include "trace_stat.h" diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f71fb2a0895..0668795d881 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -33,7 +33,8 @@ #include #include #include -#include +#define CREATE_TRACE_POINTS +#include /* * The per-CPU workqueue (if single thread, we always use the first @@ -124,8 +125,6 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); } -DEFINE_TRACE(workqueue_insertion); - static void insert_work(struct cpu_workqueue_struct *cwq, struct work_struct *work, struct list_head *head) { @@ -262,8 +261,6 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); -DEFINE_TRACE(workqueue_execution); - static void run_workqueue(struct cpu_workqueue_struct *cwq) { spin_lock_irq(&cwq->lock); @@ -753,8 +750,6 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu) return cwq; } -DEFINE_TRACE(workqueue_creation); - static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; @@ -860,8 +855,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name, } EXPORT_SYMBOL_GPL(__create_workqueue_key); -DEFINE_TRACE(workqueue_destruction); - static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) { /* -- cgit v1.2.3 From 1fdfca9c577aac96a559c1ea68f5c9156f17d636 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 20 Apr 2009 14:58:26 +0800 Subject: trace_workqueue: use list_for_each_entry() instead of list_for_each_entry_safe() No need to use list_for_each_entry_safe() in iteration without deleting any node, we can use list_for_each_entry() instead. [ Impact: cleanup ] Signed-off-by: Zhao Lei Cc: Steven Rostedt Cc: Tom Zanussi Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_workqueue.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index cfe56d31d85..128b64b93f1 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -47,12 +47,11 @@ probe_workqueue_insertion(struct task_struct *wq_thread, struct work_struct *work) { int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node, *next; + struct cpu_workqueue_stats *node; unsigned long flags; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, - list) { + list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { if (node->pid == wq_thread->pid) { atomic_inc(&node->inserted); goto found; @@ -69,12 +68,11 @@ probe_workqueue_execution(struct task_struct *wq_thread, struct work_struct *work) { int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node, *next; + struct cpu_workqueue_stats *node; unsigned long flags; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, - list) { + list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { if (node->pid == wq_thread->pid) { node->executed++; goto found; -- cgit v1.2.3 From b8867164f05791a6b5363bd51c1274e03600886e Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 20 Apr 2009 14:59:36 +0800 Subject: trace_workqueue: remove cpu_workqueue_stats->first_entry cpu_workqueue_stats->first_entry is useless because we can retrieve the header of a cpu workqueue using: if (&cpu_workqueue_stats->list == workqueue_cpu_stat(cpu)->list.next) [ Impact: cleanup ] Signed-off-by: Zhao Lei Cc: Steven Rostedt Cc: Tom Zanussi Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_workqueue.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 128b64b93f1..890974aed64 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -16,8 +16,6 @@ /* A cpu workqueue thread */ struct cpu_workqueue_stats { struct list_head list; -/* Useful to know if we print the cpu headers */ - bool first_entry; int cpu; pid_t pid; /* Can be inserted from interrupt or user context, need to be atomic */ @@ -103,8 +101,6 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) cws->pid = wq_thread->pid; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (list_empty(&workqueue_cpu_stat(cpu)->list)) - cws->first_entry = true; list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); } -- cgit v1.2.3 From f3c4ae26e93d354152196b62797ba86ad86dd0cc Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 20 Apr 2009 15:02:17 +0800 Subject: trace_workqueue: remove blank line between each cpu The blankline between each cpu's workqueue stat is not necessary, because the cpu number is enough to part them by eye. Old style also caused a blankline below headline, and made code complex by using lock, disableirq and get cpu var. Old style: # CPU INSERTED EXECUTED NAME # | | | | 0 8644 8644 events/0 0 0 0 cpuset ... 0 1 1 kdmflush 1 35365 35365 events/1 ... New style: # CPU INSERTED EXECUTED NAME # | | | | 0 8644 8644 events/0 0 0 0 cpuset ... 0 1 1 kdmflush 1 35365 35365 events/1 ... [ Impact: provide more readable code ] Signed-off-by: Zhao Lei Cc: KOSAKI Motohiro Cc: Steven Rostedt Cc: Tom Zanussi Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_workqueue.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 890974aed64..97fcea4acce 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -185,16 +185,9 @@ static void *workqueue_stat_next(void *prev, int idx) static int workqueue_stat_show(struct seq_file *s, void *p) { struct cpu_workqueue_stats *cws = p; - unsigned long flags; - int cpu = cws->cpu; struct pid *pid; struct task_struct *tsk; - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (&cws->list == workqueue_cpu_stat(cpu)->list.next) - seq_printf(s, "\n"); - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - pid = find_get_pid(cws->pid); if (pid) { tsk = get_pid_task(pid, PIDTYPE_PID); -- cgit v1.2.3 From 0d64f8342de26d02451900b1aad94716fe92c4ab Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 16 May 2009 05:58:49 +0200 Subject: tracing/stat: replace trace_stat_session by stat_session The "trace" prefix in struct trace_stat_session type is annoying while reading the trace_stat.c file. It makes the lines longer, and is not that much useful to explain the sense of this type. Just keep "struct stat_session" for this type. [ Impact: make the code a bit more readable ] Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stat.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index fdde3a4a94c..3b6816be825 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -22,7 +22,7 @@ struct trace_stat_list { }; /* A stat session is the stats output in one file */ -struct tracer_stat_session { +struct stat_session { struct list_head session_list; struct tracer_stat *ts; struct list_head stat_list; @@ -38,7 +38,7 @@ static DEFINE_MUTEX(all_stat_sessions_mutex); static struct dentry *stat_dir; -static void reset_stat_session(struct tracer_stat_session *session) +static void reset_stat_session(struct stat_session *session) { struct trace_stat_list *node, *next; @@ -48,7 +48,7 @@ static void reset_stat_session(struct tracer_stat_session *session) INIT_LIST_HEAD(&session->stat_list); } -static void destroy_session(struct tracer_stat_session *session) +static void destroy_session(struct stat_session *session) { debugfs_remove(session->file); reset_stat_session(session); @@ -71,7 +71,7 @@ static int dummy_cmp(void *p1, void *p2) * All of these copies and sorting are required on all opening * since the stats could have changed between two file sessions. */ -static int stat_seq_init(struct tracer_stat_session *session) +static int stat_seq_init(struct stat_session *session) { struct trace_stat_list *iter_entry, *new_entry; struct tracer_stat *ts = session->ts; @@ -154,7 +154,7 @@ exit_free_list: static void *stat_seq_start(struct seq_file *s, loff_t *pos) { - struct tracer_stat_session *session = s->private; + struct stat_session *session = s->private; /* Prevent from tracer switch or stat_list modification */ mutex_lock(&session->stat_mutex); @@ -168,7 +168,7 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) { - struct tracer_stat_session *session = s->private; + struct stat_session *session = s->private; if (p == SEQ_START_TOKEN) return seq_list_start(&session->stat_list, *pos); @@ -178,13 +178,13 @@ static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) static void stat_seq_stop(struct seq_file *s, void *p) { - struct tracer_stat_session *session = s->private; + struct stat_session *session = s->private; mutex_unlock(&session->stat_mutex); } static int stat_seq_show(struct seq_file *s, void *v) { - struct tracer_stat_session *session = s->private; + struct stat_session *session = s->private; struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); if (v == SEQ_START_TOKEN) @@ -205,7 +205,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file) { int ret; - struct tracer_stat_session *session = inode->i_private; + struct stat_session *session = inode->i_private; ret = seq_open(file, &trace_stat_seq_ops); if (!ret) { @@ -222,7 +222,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file) */ static int tracing_stat_release(struct inode *i, struct file *f) { - struct tracer_stat_session *session = i->i_private; + struct stat_session *session = i->i_private; mutex_lock(&session->stat_mutex); reset_stat_session(session); @@ -251,7 +251,7 @@ static int tracing_stat_init(void) return 0; } -static int init_stat_file(struct tracer_stat_session *session) +static int init_stat_file(struct stat_session *session) { if (!stat_dir && tracing_stat_init()) return -ENODEV; @@ -266,7 +266,7 @@ static int init_stat_file(struct tracer_stat_session *session) int register_stat_tracer(struct tracer_stat *trace) { - struct tracer_stat_session *session, *node, *tmp; + struct stat_session *session, *node, *tmp; int ret; if (!trace) @@ -286,7 +286,7 @@ int register_stat_tracer(struct tracer_stat *trace) mutex_unlock(&all_stat_sessions_mutex); /* Init the session */ - session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL); + session = kmalloc(sizeof(struct stat_session), GFP_KERNEL); if (!session) return -ENOMEM; @@ -312,7 +312,7 @@ int register_stat_tracer(struct tracer_stat *trace) void unregister_stat_tracer(struct tracer_stat *trace) { - struct tracer_stat_session *node, *tmp; + struct stat_session *node, *tmp; mutex_lock(&all_stat_sessions_mutex); list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { -- cgit v1.2.3 From 8f184f27300f66f6dcc8296c2dae7a1fbe8429c9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 16 May 2009 06:24:36 +0200 Subject: tracing/stat: replace linked list by an rbtree for sorting When the stat tracing framework prepares the entries from a tracer to output them to the user, it starts by computing a linear sort through a linked list to give the entries ordered by relevance to the user. This is quite ugly and causes a small latency when we begin to read the file. This patch changes that by turning the linked list into a red-black tree. Athough the whole iteration using the start and next tracer callbacks while opening the file remain the same, it is now much more fast and scalable. The rbtree guarantees O(log(n)) insertions whereas a linked list with linear sorting brought us a O(n) despair. Now the (visible) latency has disapeared. [ Impact: kill the latency while starting to read a stat tracer file ] Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stat.c | 140 +++++++++++++++++++++++++++++++++------------- 1 file changed, 100 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 3b6816be825..0bd0fc82da5 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -1,7 +1,7 @@ /* * Infrastructure for statistic tracing (histogram output). * - * Copyright (C) 2008 Frederic Weisbecker + * Copyright (C) 2008-2009 Frederic Weisbecker * * Based on the code from trace_branch.c which is * Copyright (C) 2008 Steven Rostedt @@ -10,14 +10,19 @@ #include +#include #include #include "trace_stat.h" #include "trace.h" -/* List of stat entries from a tracer */ -struct trace_stat_list { - struct list_head list; +/* + * List of stat red-black nodes from a tracer + * We use a such tree to sort quickly the stat + * entries from the tracer. + */ +struct stat_node { + struct rb_node node; void *stat; }; @@ -25,7 +30,7 @@ struct trace_stat_list { struct stat_session { struct list_head session_list; struct tracer_stat *ts; - struct list_head stat_list; + struct rb_root stat_root; struct mutex stat_mutex; struct dentry *file; }; @@ -37,15 +42,45 @@ static DEFINE_MUTEX(all_stat_sessions_mutex); /* The root directory for all stat files */ static struct dentry *stat_dir; +/* + * Iterate through the rbtree using a post order traversal path + * to release the next node. + * It won't necessary release one at each iteration + * but it will at least advance closer to the next one + * to be released. + */ +static struct rb_node *release_next(struct rb_node *node) +{ + struct stat_node *snode; + struct rb_node *parent = rb_parent(node); + + if (node->rb_left) + return node->rb_left; + else if (node->rb_right) + return node->rb_right; + else { + if (!parent) + return NULL; + if (parent->rb_left == node) + parent->rb_left = NULL; + else + parent->rb_right = NULL; + + snode = container_of(node, struct stat_node, node); + kfree(snode); + + return parent; + } +} static void reset_stat_session(struct stat_session *session) { - struct trace_stat_list *node, *next; + struct rb_node *node = session->stat_root.rb_node; - list_for_each_entry_safe(node, next, &session->stat_list, list) - kfree(node); + while (node) + node = release_next(node); - INIT_LIST_HEAD(&session->stat_list); + session->stat_root = RB_ROOT; } static void destroy_session(struct stat_session *session) @@ -56,6 +91,35 @@ static void destroy_session(struct stat_session *session) kfree(session); } +typedef int (*cmp_stat_t)(void *, void *); + +static void +insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* + * Figure out where to put new node + * This is a descendent sorting + */ + while (*new) { + struct stat_node *this; + int result; + + this = container_of(*new, struct stat_node, node); + result = cmp(data->stat, this->stat); + + parent = *new; + if (result >= 0) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); +} + /* * For tracers that don't provide a stat_cmp callback. * This one will force an immediate insertion on tail of @@ -73,8 +137,9 @@ static int dummy_cmp(void *p1, void *p2) */ static int stat_seq_init(struct stat_session *session) { - struct trace_stat_list *iter_entry, *new_entry; struct tracer_stat *ts = session->ts; + struct stat_node *new_entry; + struct rb_root *root; void *stat; int ret = 0; int i; @@ -93,15 +158,13 @@ static int stat_seq_init(struct stat_session *session) * The first entry. Actually this is the second, but the first * one (the stat_list head) is pointless. */ - new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); + new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL); if (!new_entry) { ret = -ENOMEM; goto exit; } - - INIT_LIST_HEAD(&new_entry->list); - - list_add(&new_entry->list, &session->stat_list); + root = &session->stat_root; + insert_stat(root, new_entry, dummy_cmp); new_entry->stat = stat; @@ -116,31 +179,17 @@ static int stat_seq_init(struct stat_session *session) if (!stat) break; - new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); + new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL); if (!new_entry) { ret = -ENOMEM; goto exit_free_list; } - INIT_LIST_HEAD(&new_entry->list); new_entry->stat = stat; - list_for_each_entry_reverse(iter_entry, &session->stat_list, - list) { - - /* Insertion with a descendent sorting */ - if (ts->stat_cmp(iter_entry->stat, - new_entry->stat) >= 0) { - - list_add(&new_entry->list, &iter_entry->list); - break; - } - } - - /* The current larger value */ - if (list_empty(&new_entry->list)) - list_add(&new_entry->list, &session->stat_list); + insert_stat(root, new_entry, ts->stat_cmp); } + exit: mutex_unlock(&session->stat_mutex); return ret; @@ -155,25 +204,38 @@ exit_free_list: static void *stat_seq_start(struct seq_file *s, loff_t *pos) { struct stat_session *session = s->private; + struct rb_node *node; + int i; /* Prevent from tracer switch or stat_list modification */ mutex_lock(&session->stat_mutex); /* If we are in the beginning of the file, print the headers */ - if (!*pos && session->ts->stat_headers) + if (!*pos && session->ts->stat_headers) { + (*pos)++; return SEQ_START_TOKEN; + } - return seq_list_start(&session->stat_list, *pos); + node = rb_first(&session->stat_root); + for (i = 0; node && i < *pos; i++) + node = rb_next(node); + + (*pos)++; + + return node; } static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) { struct stat_session *session = s->private; + struct rb_node *node = p; + + (*pos)++; if (p == SEQ_START_TOKEN) - return seq_list_start(&session->stat_list, *pos); + return rb_first(&session->stat_root); - return seq_list_next(p, &session->stat_list, pos); + return rb_next(node); } static void stat_seq_stop(struct seq_file *s, void *p) @@ -185,7 +247,7 @@ static void stat_seq_stop(struct seq_file *s, void *p) static int stat_seq_show(struct seq_file *s, void *v) { struct stat_session *session = s->private; - struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); + struct stat_node *l = container_of(v, struct stat_node, node); if (v == SEQ_START_TOKEN) return session->ts->stat_headers(s); @@ -286,15 +348,13 @@ int register_stat_tracer(struct tracer_stat *trace) mutex_unlock(&all_stat_sessions_mutex); /* Init the session */ - session = kmalloc(sizeof(struct stat_session), GFP_KERNEL); + session = kzalloc(sizeof(*session), GFP_KERNEL); if (!session) return -ENOMEM; session->ts = trace; INIT_LIST_HEAD(&session->session_list); - INIT_LIST_HEAD(&session->stat_list); mutex_init(&session->stat_mutex); - session->file = NULL; ret = init_stat_file(session); if (ret) { -- cgit v1.2.3 From b3dd7ba7d862707800c7ac45068f14ade2b65155 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 27 May 2009 11:04:26 +0800 Subject: tracing/stat: change dummpy_cmp() to return -1 Currently the output of trace_stat/workqueues is totally reversed: # cat /debug/tracing/trace_stat/workqueues ... 1 17 17 210 37 `-blk_unplug_work+0x0/0x57 1 3779 3779 181 11 |-cfq_kick_queue+0x0/0x2f 1 3796 3796 kblockd/1:120 ... The correct output should be: 1 3796 3796 kblockd/1:120 1 3779 3779 181 11 |-cfq_kick_queue+0x0/0x2f 1 17 17 210 37 `-blk_unplug_work+0x0/0x57 It's caused by "tracing/stat: replace linked list by an rbtree for sorting" (53059c9b67a62a3dc8c80204d3da42b9267ea5a0). dummpy_cmp() should return -1, so rb_node will always be inserted as right-most node in the rbtree, thus we sort the output in ascending order. [ Impact: fix the output of trace_stat/workqueues ] Signed-off-by: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 0bd0fc82da5..5816d1aebcc 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -127,7 +127,7 @@ insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp) */ static int dummy_cmp(void *p1, void *p2) { - return 1; + return -1; } /* -- cgit v1.2.3 From e16228069083a2f6b94383ac5739aea7a0f38ce4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 27 May 2009 11:04:48 +0800 Subject: tracing/stat: remember to free root node When closing a trace_stat file, we destroy the rbtree constructed during file open, but there is memory leak that the root node is not freed. [ Impact: fix memory leak when closing a trace_stat file ] Signed-off-by: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 5816d1aebcc..8030ec98dba 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -60,8 +60,8 @@ static struct rb_node *release_next(struct rb_node *node) return node->rb_right; else { if (!parent) - return NULL; - if (parent->rb_left == node) + ; + else if (parent->rb_left == node) parent->rb_left = NULL; else parent->rb_right = NULL; -- cgit v1.2.3 From dbd3fbdfeecfad4e71139db05d72560c3583e2a9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 27 May 2009 11:42:46 +0800 Subject: tracing/stat: do some cleanups - remove duplicate code in stat_seq_init() - update comments to reflect the change from stat list to stat rbtree [ Impact: clean up ] Signed-off-by: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stat.c | 54 ++++++++++++++++++----------------------------- 1 file changed, 21 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 8030ec98dba..17f20ebdad2 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -93,10 +93,15 @@ static void destroy_session(struct stat_session *session) typedef int (*cmp_stat_t)(void *, void *); -static void -insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp) +static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) { struct rb_node **new = &(root->rb_node), *parent = NULL; + struct stat_node *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + data->stat = stat; /* * Figure out where to put new node @@ -118,12 +123,13 @@ insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp) rb_link_node(&data->node, parent, new); rb_insert_color(&data->node, root); + return 0; } /* * For tracers that don't provide a stat_cmp callback. - * This one will force an immediate insertion on tail of - * the list. + * This one will force an insertion as right-most node + * in the rbtree. */ static int dummy_cmp(void *p1, void *p2) { @@ -131,15 +137,14 @@ static int dummy_cmp(void *p1, void *p2) } /* - * Initialize the stat list at each trace_stat file opening. + * Initialize the stat rbtree at each trace_stat file opening. * All of these copies and sorting are required on all opening * since the stats could have changed between two file sessions. */ static int stat_seq_init(struct stat_session *session) { struct tracer_stat *ts = session->ts; - struct stat_node *new_entry; - struct rb_root *root; + struct rb_root *root = &session->stat_root; void *stat; int ret = 0; int i; @@ -154,23 +159,12 @@ static int stat_seq_init(struct stat_session *session) if (!stat) goto exit; - /* - * The first entry. Actually this is the second, but the first - * one (the stat_list head) is pointless. - */ - new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL); - if (!new_entry) { - ret = -ENOMEM; + ret = insert_stat(root, stat, ts->stat_cmp); + if (ret) goto exit; - } - root = &session->stat_root; - insert_stat(root, new_entry, dummy_cmp); - - new_entry->stat = stat; /* - * Iterate over the tracer stat entries and store them in a sorted - * list. + * Iterate over the tracer stat entries and store them in an rbtree. */ for (i = 1; ; i++) { stat = ts->stat_next(stat, i); @@ -179,22 +173,16 @@ static int stat_seq_init(struct stat_session *session) if (!stat) break; - new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL); - if (!new_entry) { - ret = -ENOMEM; - goto exit_free_list; - } - - new_entry->stat = stat; - - insert_stat(root, new_entry, ts->stat_cmp); + ret = insert_stat(root, stat, ts->stat_cmp); + if (ret) + goto exit_free_rbtree; } exit: mutex_unlock(&session->stat_mutex); return ret; -exit_free_list: +exit_free_rbtree: reset_stat_session(session); mutex_unlock(&session->stat_mutex); return ret; @@ -207,7 +195,7 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) struct rb_node *node; int i; - /* Prevent from tracer switch or stat_list modification */ + /* Prevent from tracer switch or rbtree modification */ mutex_lock(&session->stat_mutex); /* If we are in the beginning of the file, print the headers */ @@ -280,7 +268,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file) } /* - * Avoid consuming memory with our now useless list. + * Avoid consuming memory with our now useless rbtree. */ static int tracing_stat_release(struct inode *i, struct file *f) { -- cgit v1.2.3 From 43bd1236234cacbc18d1476a9b57e7a306efddf5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 30 May 2009 04:25:30 +0200 Subject: tracing/stat: remove unappropriate safe walk on list register_stat_tracer() uses list_for_each_entry_safe to check whether a tracer is already present in the list. But we don't delete anything from the list here, so we don't need the safe version [ Impact: cleanup list use is stat tracing ] Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 17f20ebdad2..c00643733f4 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -316,7 +316,7 @@ static int init_stat_file(struct stat_session *session) int register_stat_tracer(struct tracer_stat *trace) { - struct stat_session *session, *node, *tmp; + struct stat_session *session, *node; int ret; if (!trace) @@ -327,7 +327,7 @@ int register_stat_tracer(struct tracer_stat *trace) /* Already registered? */ mutex_lock(&all_stat_sessions_mutex); - list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { + list_for_each_entry(node, &all_stat_sessions, session_list) { if (node->ts == trace) { mutex_unlock(&all_stat_sessions_mutex); return -EINVAL; -- cgit v1.2.3 From 2af15d6a44b871ad4c2a651302374cde8f335480 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 28 May 2009 13:37:24 -0400 Subject: ftrace: add kernel command line function filtering When using ftrace=function on the command line to trace functions on boot up, one can not filter out functions that are commonly called. This patch adds two new ftrace command line commands. ftrace_notrace=function-list ftrace_filter=function-list Where function-list is a comma separated list of functions to filter. The ftrace_notrace will make the functions listed not be included in the function tracing, and ftrace_filter will only trace the functions listed. These two act the same as the debugfs/tracing/set_ftrace_notrace and debugfs/tracing/set_ftrace_filter respectively. The simple glob expressions that are allowed by the filter files can also be used by the command line interface. ftrace_notrace=rcu*,*lock,*spin* Will not trace any function that starts with rcu, ends with lock, or has the word spin in it. Note, if the self tests are enabled, they may interfere with the filtering set by the command lines. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 140699a9a8a..2074e5b7766 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -32,6 +32,7 @@ #include #include +#include #include "trace_output.h" #include "trace_stat.h" @@ -2369,6 +2370,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset) ftrace_set_regex(buf, len, reset, 0); } +/* + * command line interface to allow users to set filters on boot up. + */ +#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE +static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; +static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; + +static int __init set_ftrace_notrace(char *str) +{ + strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_notrace=", set_ftrace_notrace); + +static int __init set_ftrace_filter(char *str) +{ + strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_filter=", set_ftrace_filter); + +static void __init set_ftrace_early_filter(char *buf, int enable) +{ + char *func; + + while (buf) { + func = strsep(&buf, ","); + ftrace_set_regex(func, strlen(func), 0, enable); + } +} + +static void __init set_ftrace_early_filters(void) +{ + if (ftrace_filter_buf[0]) + set_ftrace_early_filter(ftrace_filter_buf, 1); + if (ftrace_notrace_buf[0]) + set_ftrace_early_filter(ftrace_notrace_buf, 0); +} + static int ftrace_regex_release(struct inode *inode, struct file *file, int enable) { @@ -2829,6 +2869,8 @@ void __init ftrace_init(void) if (ret) pr_warning("Failed to register trace ftrace module notifier\n"); + set_ftrace_early_filters(); + return; failed: ftrace_disabled = 1; -- cgit v1.2.3 From 5e0a093910876882f91f1d4b8a1635a099e6c7ba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 28 May 2009 15:50:13 -0400 Subject: tracing: fix config options to not show when automatically selected There are two options that are selected by all tracers, but we want to have those options available when no tracer is selected. These are The event tracer and sched switch tracer. The are enabled by all tracers, but if a tracer is not selected we want the options to appear. All tracers including them select TRACING. Thus what we would like to do is: config EVENT_TRACER bool "prompt" depends on TRACING select TRACING But that gives us a bug in the kbuild system since we just created a circular dependency. We only want the prompt to show when TRACING is off. This patch adds GENERIC_TRACER that all tracers will select instead of TRACING. The two options (sched switch and event tracer) will select TRACING directly and depend on !GENERIC_TRACER. This solves the cicular dependency. [ Impact: hide options that are selected by default ] Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a508b9d2adb..6e55cc3ac49 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -56,6 +56,13 @@ config CONTEXT_SWITCH_TRACER select MARKERS bool +# All tracer options should select GENERIC_TRACER. For those options that are +# enabled by all tracers (context switch and event tracer) they select TRACING. +# This allows those options to appear when no other tracer is selected. But the +# options do not appear when something else selects it. We need the two options +# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the +# hidding of the automatic options options. + config TRACING bool select DEBUG_FS @@ -66,6 +73,10 @@ config TRACING select BINARY_PRINTF select EVENT_TRACING +config GENERIC_TRACER + bool + select TRACING + # # Minimum requirements an architecture has to meet for us to # be able to offer generic tracing facilities: @@ -95,7 +106,7 @@ config FUNCTION_TRACER depends on HAVE_FUNCTION_TRACER select FRAME_POINTER select KALLSYMS - select TRACING + select GENERIC_TRACER select CONTEXT_SWITCH_TRACER help Enable the kernel to trace every kernel function. This is done @@ -126,7 +137,7 @@ config IRQSOFF_TRACER depends on TRACE_IRQFLAGS_SUPPORT depends on GENERIC_TIME select TRACE_IRQFLAGS - select TRACING + select GENERIC_TRACER select TRACER_MAX_TRACE help This option measures the time spent in irqs-off critical @@ -147,7 +158,7 @@ config PREEMPT_TRACER default n depends on GENERIC_TIME depends on PREEMPT - select TRACING + select GENERIC_TRACER select TRACER_MAX_TRACE help This option measures the time spent in preemption off critical @@ -166,7 +177,7 @@ config PREEMPT_TRACER config SYSPROF_TRACER bool "Sysprof Tracer" depends on X86 - select TRACING + select GENERIC_TRACER select CONTEXT_SWITCH_TRACER help This tracer provides the trace needed by the 'Sysprof' userspace @@ -174,7 +185,7 @@ config SYSPROF_TRACER config SCHED_TRACER bool "Scheduling Latency Tracer" - select TRACING + select GENERIC_TRACER select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE help @@ -183,6 +194,7 @@ config SCHED_TRACER config ENABLE_CONTEXT_SWITCH_TRACER bool "Trace process context switches" + depends on !GENERIC_TRACER select TRACING select CONTEXT_SWITCH_TRACER help @@ -191,6 +203,7 @@ config ENABLE_CONTEXT_SWITCH_TRACER config ENABLE_EVENT_TRACING bool "Trace various events in the kernel" + depends on !GENERIC_TRACER select TRACING help This tracer hooks to various trace points in the kernel @@ -204,14 +217,14 @@ config ENABLE_EVENT_TRACING config FTRACE_SYSCALLS bool "Trace syscalls" depends on HAVE_FTRACE_SYSCALLS - select TRACING + select GENERIC_TRACER select KALLSYMS help Basic tracer to catch the syscall entry and exit events. config BOOT_TRACER bool "Trace boot initcalls" - select TRACING + select GENERIC_TRACER select CONTEXT_SWITCH_TRACER help This tracer helps developers to optimize boot times: it records @@ -228,7 +241,7 @@ config BOOT_TRACER config TRACE_BRANCH_PROFILING bool - select TRACING + select GENERIC_TRACER choice prompt "Branch Profiling" @@ -308,7 +321,7 @@ config BRANCH_TRACER config POWER_TRACER bool "Trace power consumption behavior" depends on X86 - select TRACING + select GENERIC_TRACER help This tracer helps developers to analyze and optimize the kernels power management decisions, specifically the C-state and P-state @@ -342,14 +355,14 @@ config STACK_TRACER config HW_BRANCH_TRACER depends on HAVE_HW_BRANCH_TRACER bool "Trace hw branches" - select TRACING + select GENERIC_TRACER help This tracer records all branches on the system in a circular buffer giving access to the last N branches for each cpu. config KMEMTRACE bool "Trace SLAB allocations" - select TRACING + select GENERIC_TRACER help kmemtrace provides tracing for slab allocator functions, such as kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected @@ -369,7 +382,7 @@ config KMEMTRACE config WORKQUEUE_TRACER bool "Trace workqueues" - select TRACING + select GENERIC_TRACER help The workqueue tracer provides some statistical informations about each cpu workqueue thread such as the number of the @@ -385,7 +398,7 @@ config BLK_DEV_IO_TRACE select RELAY select DEBUG_FS select TRACEPOINTS - select TRACING + select GENERIC_TRACER select STACKTRACE help Say Y here if you want to be able to trace the block layer actions @@ -446,7 +459,7 @@ config FTRACE_SELFTEST config FTRACE_STARTUP_TEST bool "Perform a startup test on ftrace" - depends on TRACING + depends on GENERIC_TRACER select FTRACE_SELFTEST help This option performs a series of startup tests on ftrace. On bootup @@ -457,7 +470,7 @@ config FTRACE_STARTUP_TEST config MMIOTRACE bool "Memory mapped IO tracing" depends on HAVE_MMIOTRACE_SUPPORT && PCI - select TRACING + select GENERIC_TRACER help Mmiotrace traces Memory Mapped I/O access and is meant for debugging and reverse engineering. It is called from the ioremap -- cgit v1.2.3 From 897f17a65389a26509bd0c79a9812d1c9ea8ea6f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 28 May 2009 16:31:21 -0400 Subject: tracing: combine the default tracers into one config Both event tracer and sched switch plugin are selected by default by all generic tracers. But if no generic tracer is enabled, their options appear. But ether one of them will select the other, thus it only makes sense to have the default tracers be selected by one option. [ Impact: clean up kconfig menu ] Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6e55cc3ac49..4a13e5a01ce 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -192,27 +192,14 @@ config SCHED_TRACER This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. -config ENABLE_CONTEXT_SWITCH_TRACER - bool "Trace process context switches" - depends on !GENERIC_TRACER - select TRACING - select CONTEXT_SWITCH_TRACER - help - This tracer gets called from the context switch and records - all switching of tasks. - -config ENABLE_EVENT_TRACING - bool "Trace various events in the kernel" +config ENABLE_DEFAULT_TRACERS + bool "Trace process context switches and events" depends on !GENERIC_TRACER select TRACING help This tracer hooks to various trace points in the kernel allowing the user to pick and choose which trace point they - want to trace. - - Note, all tracers enable event tracing. This option is - only a convenience to enable event tracing when no other - tracers are selected. + want to trace. It also includes the sched_switch tracer plugin. config FTRACE_SYSCALLS bool "Trace syscalls" -- cgit v1.2.3 From 7fcb7c472f455d1711eb5a7633204dba8800a6d6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 1 Jun 2009 15:35:46 +0800 Subject: tracing/events: introduce __dynamic_array() __string() is limited: - it's a char array, but we may want to define array with other types - a source string should be available, but we may just know the string size We introduce __dynamic_array() to break those limitations, and __string() becomes a wrapper of it. As a side effect, now __get_str() can be used in TP_fast_assign but not only TP_print. Take XFS for example, we have the string length in the dirent, but the string itself is not NULL-terminated, so __dynamic_array() can be used: TRACE_EVENT(xfs_dir2, TP_PROTO(struct xfs_da_args *args), TP_ARGS(args), TP_STRUCT__entry( __field(int, namelen) __dynamic_array(char, name, args->namelen + 1) ... ), TP_fast_assign( char *name = __get_str(name); if (args->namelen) memcpy(name, args->name, args->namelen); name[args->namelen] = '\0'; __entry->namelen = args->namelen; ), TP_printk("name %.*s namelen %d", __entry->namelen ? __get_str(name) : NULL __entry->namelen) ); [ Impact: allow defining dynamic size arrays ] Signed-off-by: Li Zefan LKML-Reference: <4A2384D2.3080403@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index a7430b16d24..db6e54bdb59 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -478,12 +478,12 @@ enum { static int is_string_field(const char *type) { + if (strstr(type, "__data_loc") && strstr(type, "char")) + return FILTER_DYN_STRING; + if (strchr(type, '[') && strstr(type, "char")) return FILTER_STATIC_STRING; - if (!strcmp(type, "__str_loc")) - return FILTER_DYN_STRING; - return 0; } -- cgit v1.2.3 From ec081ddc3d90aab35bc0de19a358b964978837cf Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 1 Jun 2009 15:53:35 +0100 Subject: tracing: add exports to use __print_symbolic and __print_flags from a module A patch to allow the use of __print_symbolic and __print_flags from a module. This allows the current GFS2 tracing patch to build. Signed-off-by: Steven Whitehouse LKML-Reference: <1243868015.29604.542.camel@localhost.localdomain> Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c12d95db2f5..0fe3b223f7e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -17,6 +17,7 @@ static DECLARE_RWSEM(trace_event_mutex); DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); +EXPORT_PER_CPU_SYMBOL(ftrace_event_seq); static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; @@ -250,6 +251,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, return p->buffer; } +EXPORT_SYMBOL(ftrace_print_flags_seq); const char * ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, @@ -275,6 +277,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, return p->buffer; } +EXPORT_SYMBOL(ftrace_print_symbols_seq); #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) -- cgit v1.2.3 From 112f38a7e36e9d688b389507136bf3af3e6d159b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 1 Jun 2009 15:16:05 -0400 Subject: tracing: make trace pipe recognize latency format flag The trace_pipe did not recognize the latency format flag and would produce different output than the trace file. The problem was partly due that the trace flags in the iterator was not set as well as the trace_pipe zeros out part of the iterator (including the flags) to be able to use the same routines as the trace file. trace_flags of the iterator should not cause any problems when not zeroed out by for trace_pipe. Reported-by: Johannes Berg Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a3a8a87d7e9..cae34c69752 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2826,6 +2826,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) /* trace pipe does not show start of buffer */ cpumask_setall(iter->started); + if (trace_flags & TRACE_ITER_LATENCY_FMT) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + iter->cpu_file = cpu_file; iter->tr = &global_trace; mutex_init(&iter->mutex); -- cgit v1.2.3 From 0f6ce3de4ef6ff940308087c49760d068851c1a7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 1 Jun 2009 21:51:28 -0400 Subject: ftrace: do not profile functions when disabled A race was found that if one were to enable and disable the function profiler repeatedly, then the system can panic. This was because a profiled function may be preempted just before disabling interrupts. While the profiler is disabled and then reenabled, the preempted function could start again, and access the hash as it is being initialized. This just adds a check in the irq disabled part to check if the profiler is enabled, and if it is not then it will just exit. When the system is disabled, the profile_enabled variable is cleared before calling the unregistering of the function profiler. This unregistering calls stop machine which also acts as a synchronize schedule. [ Impact: fix panic in enabling/disabling function profiler ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2074e5b7766..d6973dfadb3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -599,7 +599,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) local_irq_save(flags); stat = &__get_cpu_var(ftrace_profile_stats); - if (!stat->hash) + if (!stat->hash || !ftrace_profile_enabled) goto out; rec = ftrace_find_profiled_func(stat, ip); @@ -630,7 +630,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) local_irq_save(flags); stat = &__get_cpu_var(ftrace_profile_stats); - if (!stat->hash) + if (!stat->hash || !ftrace_profile_enabled) goto out; calltime = trace->rettime - trace->calltime; @@ -724,6 +724,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, ftrace_profile_enabled = 1; } else { ftrace_profile_enabled = 0; + /* + * unregister_ftrace_profiler calls stop_machine + * so this acts like an synchronize_sched. + */ unregister_ftrace_profiler(); } } -- cgit v1.2.3 From f38b082081bf69a06fffb8b32a175999e2320c5b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 2 Jun 2009 21:05:16 +1000 Subject: perf_counter: Initialize per-cpu context earlier on cpu up This arranges for perf_counter's notifier for cpu hotplug operations to be called earlier than the migration notifier in sched.c by increasing its priority to 20, compared to the 10 for the migration notifier. The reason for doing this is that a subsequent commit to convert the cpu migration counter to use the generic swcounter infrastructure will add a call into the perf_counter subsystem when tasks get migrated. Therefore the perf_counter subsystem needs a chance to initialize its per-cpu data for the new cpu before it can get called from the migration code. This also adds a comment to the migration notifier noting that its priority needs to be lower than that of the perf_counter notifier. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <18981.1900.792795.836858@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 4 ++++ kernel/sched.c | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index df319c48c52..8d2653f137e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3902,8 +3902,12 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) return NOTIFY_OK; } +/* + * This has to have a higher priority than migration_notifier in sched.c. + */ static struct notifier_block __cpuinitdata perf_cpu_nb = { .notifier_call = perf_cpu_notify, + .priority = 20, }; void __init perf_counter_init(void) diff --git a/kernel/sched.c b/kernel/sched.c index ad079f07c9c..3226cc132e9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7319,8 +7319,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -/* Register at highest priority so that task migration (migrate_all_tasks) - * happens before everything else. +/* + * Register at high priority so that task migration (migrate_all_tasks) + * happens before everything else. This has to be lower priority than + * the notifier in the perf_counter subsystem, though. */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, -- cgit v1.2.3 From 3f731ca60afc29f5bcdb5fd2a04391466313a9ac Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 1 Jun 2009 17:52:30 +1000 Subject: perf_counter: Fix cpu migration counter This fixes the cpu migration software counter to count correctly even when contexts get swapped from one task to another. Previously the cpu migration counts reported by perf stat were bogus, ranging from negative to several thousand for a single "lat_ctx 2 8 32" run. With this patch the cpu migration count reported for "lat_ctx 2 8 32" is almost always between 35 and 44. This fixes the problem by adding a call into the perf_counter code from set_task_cpu when tasks are migrated. This enables us to use the generic swcounter code (with some modifications) for the cpu migration counter. This modifies the swcounter code to allow a NULL regs pointer to be passed in to perf_swcounter_ctx_event() etc. The cpu migration counter does this because there isn't necessarily a pt_regs struct for the task available. In this case, the counter will not have interrupt capability - but the migration counter didn't have interrupt capability before, so this is no loss. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <18979.35006.819769.416327@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 74 +++++++++++++++------------------------------------ kernel/sched.c | 1 + 2 files changed, 22 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8d2653f137e..cd94cf3bf9e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2921,11 +2921,13 @@ static int perf_swcounter_match(struct perf_counter *counter, if (counter->hw_event.config != event_config) return 0; - if (counter->hw_event.exclude_user && user_mode(regs)) - return 0; + if (regs) { + if (counter->hw_event.exclude_user && user_mode(regs)) + return 0; - if (counter->hw_event.exclude_kernel && !user_mode(regs)) - return 0; + if (counter->hw_event.exclude_kernel && !user_mode(regs)) + return 0; + } return 1; } @@ -2935,7 +2937,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, { int neg = atomic64_add_negative(nr, &counter->hw.count); - if (counter->hw.irq_period && !neg) + if (counter->hw.irq_period && !neg && regs) perf_swcounter_overflow(counter, nmi, regs, addr); } @@ -3151,55 +3153,24 @@ static const struct pmu perf_ops_task_clock = { /* * Software counter: cpu migrations */ - -static inline u64 get_cpu_migrations(struct perf_counter *counter) -{ - struct task_struct *curr = counter->ctx->task; - - if (curr) - return curr->se.nr_migrations; - return cpu_nr_migrations(smp_processor_id()); -} - -static void cpu_migrations_perf_counter_update(struct perf_counter *counter) -{ - u64 prev, now; - s64 delta; - - prev = atomic64_read(&counter->hw.prev_count); - now = get_cpu_migrations(counter); - - atomic64_set(&counter->hw.prev_count, now); - - delta = now - prev; - - atomic64_add(delta, &counter->count); -} - -static void cpu_migrations_perf_counter_read(struct perf_counter *counter) +void perf_counter_task_migration(struct task_struct *task, int cpu) { - cpu_migrations_perf_counter_update(counter); -} + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx; -static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) -{ - if (counter->prev_state <= PERF_COUNTER_STATE_OFF) - atomic64_set(&counter->hw.prev_count, - get_cpu_migrations(counter)); - return 0; -} + perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE, + PERF_COUNT_CPU_MIGRATIONS, + 1, 1, NULL, 0); -static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) -{ - cpu_migrations_perf_counter_update(counter); + ctx = perf_pin_task_context(task); + if (ctx) { + perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE, + PERF_COUNT_CPU_MIGRATIONS, + 1, 1, NULL, 0); + perf_unpin_context(ctx); + } } -static const struct pmu perf_ops_cpu_migrations = { - .enable = cpu_migrations_perf_counter_enable, - .disable = cpu_migrations_perf_counter_disable, - .read = cpu_migrations_perf_counter_read, -}; - #ifdef CONFIG_EVENT_PROFILE void perf_tpcounter_event(int event_id) { @@ -3272,11 +3243,8 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_PAGE_FAULTS_MIN: case PERF_COUNT_PAGE_FAULTS_MAJ: case PERF_COUNT_CONTEXT_SWITCHES: - pmu = &perf_ops_generic; - break; case PERF_COUNT_CPU_MIGRATIONS: - if (!counter->hw_event.exclude_kernel) - pmu = &perf_ops_cpu_migrations; + pmu = &perf_ops_generic; break; } diff --git a/kernel/sched.c b/kernel/sched.c index 3226cc132e9..8d43347a0c0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1977,6 +1977,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); #endif + perf_counter_task_migration(p, new_cpu); } p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime; -- cgit v1.2.3 From bf4e0ed3d027ce581be18496036862131b5f32aa Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 1 Jun 2009 17:53:16 +1000 Subject: perf_counter: Remove unused prev_state field This removes the prev_state field of struct perf_counter since it is now unused. It was only used by the cpu migration counter, which doesn't use it any more. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <18979.35052.915728.626374@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index cd94cf3bf9e..fbed4d28ad7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -572,7 +572,6 @@ group_sched_in(struct perf_counter *group_counter, if (ret) return ret < 0 ? ret : 0; - group_counter->prev_state = group_counter->state; if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) return -EAGAIN; @@ -580,7 +579,6 @@ group_sched_in(struct perf_counter *group_counter, * Schedule in siblings as one group (if any): */ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - counter->prev_state = counter->state; if (counter_sched_in(counter, cpuctx, ctx, cpu)) { partial_group = counter; goto group_error; @@ -657,7 +655,6 @@ static void add_counter_to_ctx(struct perf_counter *counter, struct perf_counter_context *ctx) { list_add_counter(counter, ctx); - counter->prev_state = PERF_COUNTER_STATE_OFF; counter->tstamp_enabled = ctx->time; counter->tstamp_running = ctx->time; counter->tstamp_stopped = ctx->time; @@ -820,7 +817,6 @@ static void __perf_counter_enable(void *info) ctx->is_active = 1; update_context_time(ctx); - counter->prev_state = counter->state; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; -- cgit v1.2.3 From 709e50cf870e61745b39552044aa6c7c38e4f9e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 14:13:15 +0200 Subject: perf_counter: Use PID namespaces properly Stop using task_struct::pid and start using PID namespaces. PIDs will be reported in the PID namespace of the monitoring task at the moment of counter creation. Signed-off-by: Peter Zijlstra Cc: Eric W. Biederman Cc: Oleg Nesterov Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index fbed4d28ad7..caa012cfe49 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1432,6 +1432,8 @@ static void free_counter_rcu(struct rcu_head *head) struct perf_counter *counter; counter = container_of(head, struct perf_counter, rcu_head); + if (counter->ns) + put_pid_ns(counter->ns); kfree(counter); } @@ -2267,6 +2269,28 @@ static void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } +static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p) +{ + /* + * only top level counters have the pid namespace they were created in + */ + if (counter->parent) + counter = counter->parent; + + return task_tgid_nr_ns(p, counter->ns); +} + +static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) +{ + /* + * only top level counters have the pid namespace they were created in + */ + if (counter->parent) + counter = counter->parent; + + return task_pid_nr_ns(p, counter->ns); +} + static void perf_counter_output(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { @@ -2303,8 +2327,8 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_TID) { /* namespace issues */ - tid_entry.pid = current->group_leader->pid; - tid_entry.tid = current->pid; + tid_entry.pid = perf_counter_pid(counter, current); + tid_entry.tid = perf_counter_tid(counter, current); header.type |= PERF_RECORD_TID; header.size += sizeof(tid_entry); @@ -2432,6 +2456,9 @@ static void perf_counter_comm_output(struct perf_counter *counter, if (ret) return; + comm_event->event.pid = perf_counter_pid(counter, comm_event->task); + comm_event->event.tid = perf_counter_tid(counter, comm_event->task); + perf_output_put(&handle, comm_event->event); perf_output_copy(&handle, comm_event->comm, comm_event->comm_size); @@ -2504,8 +2531,6 @@ void perf_counter_comm(struct task_struct *task) .task = task, .event = { .header = { .type = PERF_EVENT_COMM, }, - .pid = task->group_leader->pid, - .tid = task->pid, }, }; @@ -2542,6 +2567,9 @@ static void perf_counter_mmap_output(struct perf_counter *counter, if (ret) return; + mmap_event->event.pid = perf_counter_pid(counter, current); + mmap_event->event.tid = perf_counter_tid(counter, current); + perf_output_put(&handle, mmap_event->event); perf_output_copy(&handle, mmap_event->file_name, mmap_event->file_size); @@ -2641,8 +2669,6 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, .file = file, .event = { .header = { .type = PERF_EVENT_MMAP, }, - .pid = current->group_leader->pid, - .tid = current->pid, .start = addr, .len = len, .pgoff = pgoff, @@ -2664,8 +2690,6 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, .file = file, .event = { .header = { .type = PERF_EVENT_MUNMAP, }, - .pid = current->group_leader->pid, - .tid = current->pid, .start = addr, .len = len, .pgoff = pgoff, @@ -3445,6 +3469,8 @@ SYSCALL_DEFINE5(perf_counter_open, list_add_tail(&counter->owner_entry, ¤t->perf_counter_list); mutex_unlock(¤t->perf_counter_mutex); + counter->ns = get_pid_ns(current->nsproxy->pid_ns); + fput_light(counter_file, fput_needed2); out_fput: -- cgit v1.2.3 From 179c498ae2998461fe436437a74dc29036fc7dcc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Jun 2009 12:03:19 -0400 Subject: function-graph: only allocate init tasks if it was not already done When the function graph tracer is enabled, it calls the initialization needed for the init tasks that would be called on all created tasks. The problem is that this is called every time the function graph tracer is enabled, and the ret_stack is allocated for the idle tasks each time. Thus, the old ret_stack is lost and a memory leak is created. This is also dangerous because if an interrupt happened on another CPU with the init task and the ret_stack is replaced, we then lose all the return pointers for the interrupt, and a crash would take place. [ Impact: fix memory leak and possible crash due to race ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f1ed080406c..ebff62ef40b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2643,8 +2643,10 @@ static int start_graph_tracing(void) return -ENOMEM; /* The cpu_boot init_task->ret_stack will never be freed */ - for_each_online_cpu(cpu) - ftrace_graph_init_task(idle_task(cpu)); + for_each_online_cpu(cpu) { + if (!idle_task(cpu)->ret_stack) + ftrace_graph_init_task(idle_task(cpu)); + } do { ret = alloc_retstack_tasklist(ret_stack_list); -- cgit v1.2.3 From 82310a3272d5a2a7652f5649ad8a55f58c8f74d9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Jun 2009 12:26:07 -0400 Subject: function-graph: enable the stack after initialization of other variables The function graph tracer checks if the task_struct has ret_stack defined to know if it is OK or not to use it. The initialization is done for all tasks by one process, but the idle tasks use the same initialization used by new tasks. If an interrupt happens on an idle task that just had the ret_stack created, but before the rest of the initialization took place, then we can corrupt the return address of the functions. This patch moves the setting of the task_struct's ret_stack to after the other variables have been initialized. [ Impact: prevent kernel panic on idle task when starting function graph ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 9 +++++++-- kernel/trace/trace_functions_graph.c | 6 ++++++ 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ebff62ef40b..20e066065eb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2739,15 +2739,20 @@ void unregister_ftrace_graph(void) void ftrace_graph_init_task(struct task_struct *t) { if (atomic_read(&ftrace_graph_active)) { - t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + struct ftrace_ret_stack *ret_stack; + + ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH * sizeof(struct ftrace_ret_stack), GFP_KERNEL); - if (!t->ret_stack) + if (!ret_stack) return; t->curr_ret_stack = -1; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->ftrace_timestamp = 0; + /* make curr_ret_stack visable before we add the ret_stack */ + smp_wmb(); + t->ret_stack = ret_stack; } else t->ret_stack = NULL; } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d28687e7b3a..baeb5fe3610 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -65,6 +65,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) if (!current->ret_stack) return -EBUSY; + /* + * We must make sure the ret_stack is tested before we read + * anything else. + */ + smp_rmb(); + /* The return trace stack is full */ if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { atomic_inc(¤t->trace_overrun); -- cgit v1.2.3 From 26c01624a2a40f8a4ddf6449b65c9b1c418d0e72 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Jun 2009 14:01:19 -0400 Subject: function-graph: add memory barriers for accessing task's ret_stack The code that handles the tasks ret_stack allocation for every task assumes that only an interrupt can cause issues (even though interrupts are disabled). In reality, the code is allocating the ret_stack for tasks that may be running on other CPUs and there are not efficient memory barriers to handle this case. [ Impact: prevent crash due to using of uninitialized ret_stack variables ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 20e066065eb..1664d3f33d3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2580,12 +2580,12 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) } if (t->ret_stack == NULL) { - t->curr_ret_stack = -1; - /* Make sure IRQs see the -1 first: */ - barrier(); - t->ret_stack = ret_stack_list[start++]; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); + t->curr_ret_stack = -1; + /* Make sure the tasks see the -1 first: */ + smp_wmb(); + t->ret_stack = ret_stack_list[start++]; } } while_each_thread(g, t); -- cgit v1.2.3 From 8e5799b1ad2a0567fdfaaf0e91b40efee010f2c1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:08:15 +0200 Subject: perf_counter: Add unique counter id Stephan raised the issue that we currently cannot distinguish between similar counters within a group (PERF_RECORD_GROUP uses the config value as identifier). Therefore, generate a new ID for each counter using a global u64 sequence counter. Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index caa012cfe49..978ecfcc7aa 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1510,6 +1510,8 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = counter->total_time_running + atomic64_read(&counter->child_total_time_running); + if (counter->hw_event.read_format & PERF_FORMAT_ID) + values[n++] = counter->id; mutex_unlock(&counter->child_mutex); if (count < n * sizeof(u64)) @@ -2303,7 +2305,7 @@ static void perf_counter_output(struct perf_counter *counter, u32 pid, tid; } tid_entry; struct { - u64 event; + u64 id; u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; @@ -2416,7 +2418,7 @@ static void perf_counter_output(struct perf_counter *counter, if (sub != counter) sub->pmu->read(sub); - group_entry.event = sub->hw_event.config; + group_entry.id = sub->id; group_entry.counter = atomic64_read(&sub->count); perf_output_put(&handle, group_entry); @@ -3375,6 +3377,8 @@ done: return counter; } +static atomic64_t perf_counter_id; + /** * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * @@ -3470,6 +3474,7 @@ SYSCALL_DEFINE5(perf_counter_open, mutex_unlock(¤t->perf_counter_mutex); counter->ns = get_pid_ns(current->nsproxy->pid_ns); + counter->id = atomic64_inc_return(&perf_counter_id); fput_light(counter_file, fput_needed2); -- cgit v1.2.3 From b23f3325ed465f1bd914384884269af0d106778c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:13:03 +0200 Subject: perf_counter: Rename various fields A few renames: s/irq_period/sample_period/ s/irq_freq/sample_freq/ s/PERF_RECORD_/PERF_SAMPLE_/ s/record_type/sample_type/ And change both the new sample_type and read_format to u64. Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 104 +++++++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 978ecfcc7aa..5ecd9981c03 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1186,7 +1186,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period); static void perf_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; - u64 interrupts, irq_period; + u64 interrupts, sample_period; u64 events, period; s64 delta; @@ -1204,23 +1204,23 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) interrupts = 2*sysctl_perf_counter_limit/HZ; } - if (!counter->hw_event.freq || !counter->hw_event.irq_freq) + if (!counter->hw_event.freq || !counter->hw_event.sample_freq) continue; - events = HZ * interrupts * counter->hw.irq_period; - period = div64_u64(events, counter->hw_event.irq_freq); + events = HZ * interrupts * counter->hw.sample_period; + period = div64_u64(events, counter->hw_event.sample_freq); - delta = (s64)(1 + period - counter->hw.irq_period); + delta = (s64)(1 + period - counter->hw.sample_period); delta >>= 1; - irq_period = counter->hw.irq_period + delta; + sample_period = counter->hw.sample_period + delta; - if (!irq_period) - irq_period = 1; + if (!sample_period) + sample_period = 1; - perf_log_period(counter, irq_period); + perf_log_period(counter, sample_period); - counter->hw.irq_period = irq_period; + counter->hw.sample_period = sample_period; } spin_unlock(&ctx->lock); } @@ -2297,7 +2297,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { int ret; - u64 record_type = counter->hw_event.record_type; + u64 sample_type = counter->hw_event.sample_type; struct perf_output_handle handle; struct perf_event_header header; u64 ip; @@ -2321,61 +2321,61 @@ static void perf_counter_output(struct perf_counter *counter, header.misc = PERF_EVENT_MISC_OVERFLOW; header.misc |= perf_misc_flags(regs); - if (record_type & PERF_RECORD_IP) { + if (sample_type & PERF_SAMPLE_IP) { ip = perf_instruction_pointer(regs); - header.type |= PERF_RECORD_IP; + header.type |= PERF_SAMPLE_IP; header.size += sizeof(ip); } - if (record_type & PERF_RECORD_TID) { + if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ tid_entry.pid = perf_counter_pid(counter, current); tid_entry.tid = perf_counter_tid(counter, current); - header.type |= PERF_RECORD_TID; + header.type |= PERF_SAMPLE_TID; header.size += sizeof(tid_entry); } - if (record_type & PERF_RECORD_TIME) { + if (sample_type & PERF_SAMPLE_TIME) { /* * Maybe do better on x86 and provide cpu_clock_nmi() */ time = sched_clock(); - header.type |= PERF_RECORD_TIME; + header.type |= PERF_SAMPLE_TIME; header.size += sizeof(u64); } - if (record_type & PERF_RECORD_ADDR) { - header.type |= PERF_RECORD_ADDR; + if (sample_type & PERF_SAMPLE_ADDR) { + header.type |= PERF_SAMPLE_ADDR; header.size += sizeof(u64); } - if (record_type & PERF_RECORD_CONFIG) { - header.type |= PERF_RECORD_CONFIG; + if (sample_type & PERF_SAMPLE_CONFIG) { + header.type |= PERF_SAMPLE_CONFIG; header.size += sizeof(u64); } - if (record_type & PERF_RECORD_CPU) { - header.type |= PERF_RECORD_CPU; + if (sample_type & PERF_SAMPLE_CPU) { + header.type |= PERF_SAMPLE_CPU; header.size += sizeof(cpu_entry); cpu_entry.cpu = raw_smp_processor_id(); } - if (record_type & PERF_RECORD_GROUP) { - header.type |= PERF_RECORD_GROUP; + if (sample_type & PERF_SAMPLE_GROUP) { + header.type |= PERF_SAMPLE_GROUP; header.size += sizeof(u64) + counter->nr_siblings * sizeof(group_entry); } - if (record_type & PERF_RECORD_CALLCHAIN) { + if (sample_type & PERF_SAMPLE_CALLCHAIN) { callchain = perf_callchain(regs); if (callchain) { callchain_size = (1 + callchain->nr) * sizeof(u64); - header.type |= PERF_RECORD_CALLCHAIN; + header.type |= PERF_SAMPLE_CALLCHAIN; header.size += callchain_size; } } @@ -2386,28 +2386,28 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_put(&handle, header); - if (record_type & PERF_RECORD_IP) + if (sample_type & PERF_SAMPLE_IP) perf_output_put(&handle, ip); - if (record_type & PERF_RECORD_TID) + if (sample_type & PERF_SAMPLE_TID) perf_output_put(&handle, tid_entry); - if (record_type & PERF_RECORD_TIME) + if (sample_type & PERF_SAMPLE_TIME) perf_output_put(&handle, time); - if (record_type & PERF_RECORD_ADDR) + if (sample_type & PERF_SAMPLE_ADDR) perf_output_put(&handle, addr); - if (record_type & PERF_RECORD_CONFIG) + if (sample_type & PERF_SAMPLE_CONFIG) perf_output_put(&handle, counter->hw_event.config); - if (record_type & PERF_RECORD_CPU) + if (sample_type & PERF_SAMPLE_CPU) perf_output_put(&handle, cpu_entry); /* - * XXX PERF_RECORD_GROUP vs inherited counters seems difficult. + * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. */ - if (record_type & PERF_RECORD_GROUP) { + if (sample_type & PERF_SAMPLE_GROUP) { struct perf_counter *leader, *sub; u64 nr = counter->nr_siblings; @@ -2702,7 +2702,7 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, } /* - * Log irq_period changes so that analyzing tools can re-normalize the + * Log sample_period changes so that analyzing tools can re-normalize the * event flow. */ @@ -2725,7 +2725,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period) .period = period, }; - if (counter->hw.irq_period == period) + if (counter->hw.sample_period == period) return; ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0); @@ -2834,7 +2834,7 @@ static void perf_swcounter_set_period(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->irq_period; + s64 period = hwc->sample_period; if (unlikely(left <= -period)) { left = period; @@ -2874,7 +2874,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) ret = HRTIMER_NORESTART; } - period = max_t(u64, 10000, counter->hw.irq_period); + period = max_t(u64, 10000, counter->hw.sample_period); hrtimer_forward_now(hrtimer, ns_to_ktime(period)); return ret; @@ -2959,7 +2959,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, { int neg = atomic64_add_negative(nr, &counter->hw.count); - if (counter->hw.irq_period && !neg && regs) + if (counter->hw.sample_period && !neg && regs) perf_swcounter_overflow(counter, nmi, regs, addr); } @@ -3080,8 +3080,8 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) atomic64_set(&hwc->prev_count, cpu_clock(cpu)); hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->irq_period) { - u64 period = max_t(u64, 10000, hwc->irq_period); + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); @@ -3092,7 +3092,7 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) static void cpu_clock_perf_counter_disable(struct perf_counter *counter) { - if (counter->hw.irq_period) + if (counter->hw.sample_period) hrtimer_cancel(&counter->hw.hrtimer); cpu_clock_perf_counter_update(counter); } @@ -3132,8 +3132,8 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) atomic64_set(&hwc->prev_count, now); hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->irq_period) { - u64 period = max_t(u64, 10000, hwc->irq_period); + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); @@ -3144,7 +3144,7 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) static void task_clock_perf_counter_disable(struct perf_counter *counter) { - if (counter->hw.irq_period) + if (counter->hw.sample_period) hrtimer_cancel(&counter->hw.hrtimer); task_clock_perf_counter_update(counter, counter->ctx->time); @@ -3223,7 +3223,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) return NULL; counter->destroy = tp_perf_counter_destroy; - counter->hw.irq_period = counter->hw_event.irq_period; + counter->hw.sample_period = counter->hw_event.sample_period; return &perf_ops_generic; } @@ -3323,15 +3323,15 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, pmu = NULL; hwc = &counter->hw; - if (hw_event->freq && hw_event->irq_freq) - hwc->irq_period = div64_u64(TICK_NSEC, hw_event->irq_freq); + if (hw_event->freq && hw_event->sample_freq) + hwc->sample_period = div64_u64(TICK_NSEC, hw_event->sample_freq); else - hwc->irq_period = hw_event->irq_period; + hwc->sample_period = hw_event->sample_period; /* - * we currently do not support PERF_RECORD_GROUP on inherited counters + * we currently do not support PERF_SAMPLE_GROUP on inherited counters */ - if (hw_event->inherit && (hw_event->record_type & PERF_RECORD_GROUP)) + if (hw_event->inherit && (hw_event->sample_type & PERF_SAMPLE_GROUP)) goto done; if (perf_event_raw(hw_event)) { -- cgit v1.2.3 From 8e3747c13c39246c7e46def7cf495d9d21d4c5f9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 16:16:02 +0200 Subject: perf_counter: Change data head from u32 to u64 Since some people worried that 4G might not be a large enough as an mmap data window, extend it to 64 bit for capable platforms. Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5ecd9981c03..3f11a2bc6c7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2067,8 +2067,8 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) struct perf_output_handle { struct perf_counter *counter; struct perf_mmap_data *data; - unsigned int offset; - unsigned int head; + unsigned long head; + unsigned long offset; int nmi; int overflow; int locked; @@ -2122,7 +2122,8 @@ static void perf_output_lock(struct perf_output_handle *handle) static void perf_output_unlock(struct perf_output_handle *handle) { struct perf_mmap_data *data = handle->data; - int head, cpu; + unsigned long head; + int cpu; data->done_head = data->head; @@ -2135,7 +2136,7 @@ again: * before we publish the new head, matched by a rmb() in userspace when * reading this position. */ - while ((head = atomic_xchg(&data->done_head, 0))) + while ((head = atomic_long_xchg(&data->done_head, 0))) data->user_page->data_head = head; /* @@ -2148,7 +2149,7 @@ again: /* * Therefore we have to validate we did not indeed do so. */ - if (unlikely(atomic_read(&data->done_head))) { + if (unlikely(atomic_long_read(&data->done_head))) { /* * Since we had it locked, we can lock it again. */ @@ -2195,7 +2196,7 @@ static int perf_output_begin(struct perf_output_handle *handle, do { offset = head = atomic_read(&data->head); head += size; - } while (atomic_cmpxchg(&data->head, offset, head) != offset); + } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); handle->offset = offset; handle->head = head; @@ -2246,7 +2247,7 @@ static void perf_output_copy(struct perf_output_handle *handle, * Check we didn't copy past our reservation window, taking the * possible unsigned int wrap into account. */ - WARN_ON_ONCE(((int)(handle->head - handle->offset)) < 0); + WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); } #define perf_output_put(handle, x) \ -- cgit v1.2.3 From 08247e31ca79b8f02cce47b7e8120797a8726606 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 16:46:57 +0200 Subject: perf_counter: Add ioctl for changing the sample period/frequency Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3f11a2bc6c7..abe2f3b6c42 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1604,6 +1604,43 @@ static void perf_counter_for_each(struct perf_counter *counter, mutex_unlock(&counter->child_mutex); } +static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) +{ + struct perf_counter_context *ctx = counter->ctx; + unsigned long size; + int ret = 0; + u64 value; + + if (!counter->hw_event.sample_period) + return -EINVAL; + + size = copy_from_user(&value, arg, sizeof(value)); + if (size != sizeof(value)) + return -EFAULT; + + if (!value) + return -EINVAL; + + spin_lock_irq(&ctx->lock); + if (counter->hw_event.freq) { + if (value > sysctl_perf_counter_limit) { + ret = -EINVAL; + goto unlock; + } + + counter->hw_event.sample_freq = value; + } else { + counter->hw_event.sample_period = value; + counter->hw.sample_period = value; + + perf_log_period(counter, value); + } +unlock: + spin_unlock_irq(&ctx->lock); + + return ret; +} + static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct perf_counter *counter = file->private_data; @@ -1623,6 +1660,10 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_COUNTER_IOC_REFRESH: return perf_counter_refresh(counter, arg); + + case PERF_COUNTER_IOC_PERIOD: + return perf_counter_period(counter, (u64 __user *)arg); + default: return -ENOTTY; } -- cgit v1.2.3 From 0d48696f87e3618b0d35bd3e4e9d7c188d51e7de Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 19:22:16 +0200 Subject: perf_counter: Rename perf_counter_hw_event => perf_counter_attr The structure isn't hw only and when I read event, I think about those things that fall out the other end. Rename the thing. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 116 +++++++++++++++++++++++++------------------------- 1 file changed, 58 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index abe2f3b6c42..317cef78a38 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -260,7 +260,7 @@ counter_sched_out(struct perf_counter *counter, if (!is_software_counter(counter)) cpuctx->active_oncpu--; ctx->nr_active--; - if (counter->hw_event.exclusive || !cpuctx->active_oncpu) + if (counter->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; } @@ -282,7 +282,7 @@ group_sched_out(struct perf_counter *group_counter, list_for_each_entry(counter, &group_counter->sibling_list, list_entry) counter_sched_out(counter, cpuctx, ctx); - if (group_counter->hw_event.exclusive) + if (group_counter->attr.exclusive) cpuctx->exclusive = 0; } @@ -550,7 +550,7 @@ counter_sched_in(struct perf_counter *counter, cpuctx->active_oncpu++; ctx->nr_active++; - if (counter->hw_event.exclusive) + if (counter->attr.exclusive) cpuctx->exclusive = 1; return 0; @@ -642,7 +642,7 @@ static int group_can_go_on(struct perf_counter *counter, * If this group is exclusive and there are already * counters on the CPU, it can't go on. */ - if (counter->hw_event.exclusive && cpuctx->active_oncpu) + if (counter->attr.exclusive && cpuctx->active_oncpu) return 0; /* * Otherwise, try to add it if all previous groups were able @@ -725,7 +725,7 @@ static void __perf_install_in_context(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) { + if (leader->attr.pinned) { update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; } @@ -849,7 +849,7 @@ static void __perf_counter_enable(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) { + if (leader->attr.pinned) { update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; } @@ -927,7 +927,7 @@ static int perf_counter_refresh(struct perf_counter *counter, int refresh) /* * not supported on inherited counters */ - if (counter->hw_event.inherit) + if (counter->attr.inherit) return -EINVAL; atomic_add(refresh, &counter->event_limit); @@ -1094,7 +1094,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, */ list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state <= PERF_COUNTER_STATE_OFF || - !counter->hw_event.pinned) + !counter->attr.pinned) continue; if (counter->cpu != -1 && counter->cpu != cpu) continue; @@ -1122,7 +1122,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, * ignore pinned counters since we did them already. */ if (counter->state <= PERF_COUNTER_STATE_OFF || - counter->hw_event.pinned) + counter->attr.pinned) continue; /* @@ -1204,11 +1204,11 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) interrupts = 2*sysctl_perf_counter_limit/HZ; } - if (!counter->hw_event.freq || !counter->hw_event.sample_freq) + if (!counter->attr.freq || !counter->attr.sample_freq) continue; events = HZ * interrupts * counter->hw.sample_period; - period = div64_u64(events, counter->hw_event.sample_freq); + period = div64_u64(events, counter->attr.sample_freq); delta = (s64)(1 + period - counter->hw.sample_period); delta >>= 1; @@ -1444,11 +1444,11 @@ static void free_counter(struct perf_counter *counter) perf_pending_sync(counter); atomic_dec(&nr_counters); - if (counter->hw_event.mmap) + if (counter->attr.mmap) atomic_dec(&nr_mmap_tracking); - if (counter->hw_event.munmap) + if (counter->attr.munmap) atomic_dec(&nr_munmap_tracking); - if (counter->hw_event.comm) + if (counter->attr.comm) atomic_dec(&nr_comm_tracking); if (counter->destroy) @@ -1504,13 +1504,13 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) mutex_lock(&counter->child_mutex); values[0] = perf_counter_read(counter); n = 1; - if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = counter->total_time_enabled + atomic64_read(&counter->child_total_time_enabled); - if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = counter->total_time_running + atomic64_read(&counter->child_total_time_running); - if (counter->hw_event.read_format & PERF_FORMAT_ID) + if (counter->attr.read_format & PERF_FORMAT_ID) values[n++] = counter->id; mutex_unlock(&counter->child_mutex); @@ -1611,7 +1611,7 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) int ret = 0; u64 value; - if (!counter->hw_event.sample_period) + if (!counter->attr.sample_period) return -EINVAL; size = copy_from_user(&value, arg, sizeof(value)); @@ -1622,15 +1622,15 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) return -EINVAL; spin_lock_irq(&ctx->lock); - if (counter->hw_event.freq) { + if (counter->attr.freq) { if (value > sysctl_perf_counter_limit) { ret = -EINVAL; goto unlock; } - counter->hw_event.sample_freq = value; + counter->attr.sample_freq = value; } else { - counter->hw_event.sample_period = value; + counter->attr.sample_period = value; counter->hw.sample_period = value; perf_log_period(counter, value); @@ -2299,7 +2299,7 @@ static void perf_output_end(struct perf_output_handle *handle) struct perf_counter *counter = handle->counter; struct perf_mmap_data *data = handle->data; - int wakeup_events = counter->hw_event.wakeup_events; + int wakeup_events = counter->attr.wakeup_events; if (handle->overflow && wakeup_events) { int events = atomic_inc_return(&data->events); @@ -2339,7 +2339,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { int ret; - u64 sample_type = counter->hw_event.sample_type; + u64 sample_type = counter->attr.sample_type; struct perf_output_handle handle; struct perf_event_header header; u64 ip; @@ -2441,7 +2441,7 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_put(&handle, addr); if (sample_type & PERF_SAMPLE_CONFIG) - perf_output_put(&handle, counter->hw_event.config); + perf_output_put(&handle, counter->attr.config); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(&handle, cpu_entry); @@ -2512,7 +2512,7 @@ static void perf_counter_comm_output(struct perf_counter *counter, static int perf_counter_comm_match(struct perf_counter *counter, struct perf_comm_event *comm_event) { - if (counter->hw_event.comm && + if (counter->attr.comm && comm_event->event.header.type == PERF_EVENT_COMM) return 1; @@ -2623,11 +2623,11 @@ static void perf_counter_mmap_output(struct perf_counter *counter, static int perf_counter_mmap_match(struct perf_counter *counter, struct perf_mmap_event *mmap_event) { - if (counter->hw_event.mmap && + if (counter->attr.mmap && mmap_event->event.header.type == PERF_EVENT_MMAP) return 1; - if (counter->hw_event.munmap && + if (counter->attr.munmap && mmap_event->event.header.type == PERF_EVENT_MUNMAP) return 1; @@ -2907,8 +2907,8 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) * In case we exclude kernel IPs or are somehow not in interrupt * context, provide the next best thing, the user IP. */ - if ((counter->hw_event.exclude_kernel || !regs) && - !counter->hw_event.exclude_user) + if ((counter->attr.exclude_kernel || !regs) && + !counter->attr.exclude_user) regs = task_pt_regs(current); if (regs) { @@ -2982,14 +2982,14 @@ static int perf_swcounter_match(struct perf_counter *counter, if (!perf_swcounter_is_counting(counter)) return 0; - if (counter->hw_event.config != event_config) + if (counter->attr.config != event_config) return 0; if (regs) { - if (counter->hw_event.exclude_user && user_mode(regs)) + if (counter->attr.exclude_user && user_mode(regs)) return 0; - if (counter->hw_event.exclude_kernel && !user_mode(regs)) + if (counter->attr.exclude_kernel && !user_mode(regs)) return 0; } @@ -3252,12 +3252,12 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - ftrace_profile_disable(perf_event_id(&counter->hw_event)); + ftrace_profile_disable(perf_event_id(&counter->attr)); } static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { - int event_id = perf_event_id(&counter->hw_event); + int event_id = perf_event_id(&counter->attr); int ret; ret = ftrace_profile_enable(event_id); @@ -3265,7 +3265,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) return NULL; counter->destroy = tp_perf_counter_destroy; - counter->hw.sample_period = counter->hw_event.sample_period; + counter->hw.sample_period = counter->attr.sample_period; return &perf_ops_generic; } @@ -3287,7 +3287,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (perf_event_id(&counter->hw_event)) { + switch (perf_event_id(&counter->attr)) { case PERF_COUNT_CPU_CLOCK: pmu = &perf_ops_cpu_clock; @@ -3319,7 +3319,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(struct perf_counter_hw_event *hw_event, +perf_counter_alloc(struct perf_counter_attr *attr, int cpu, struct perf_counter_context *ctx, struct perf_counter *group_leader, @@ -3352,36 +3352,36 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, mutex_init(&counter->mmap_mutex); counter->cpu = cpu; - counter->hw_event = *hw_event; + counter->attr = *attr; counter->group_leader = group_leader; counter->pmu = NULL; counter->ctx = ctx; counter->oncpu = -1; counter->state = PERF_COUNTER_STATE_INACTIVE; - if (hw_event->disabled) + if (attr->disabled) counter->state = PERF_COUNTER_STATE_OFF; pmu = NULL; hwc = &counter->hw; - if (hw_event->freq && hw_event->sample_freq) - hwc->sample_period = div64_u64(TICK_NSEC, hw_event->sample_freq); + if (attr->freq && attr->sample_freq) + hwc->sample_period = div64_u64(TICK_NSEC, attr->sample_freq); else - hwc->sample_period = hw_event->sample_period; + hwc->sample_period = attr->sample_period; /* * we currently do not support PERF_SAMPLE_GROUP on inherited counters */ - if (hw_event->inherit && (hw_event->sample_type & PERF_SAMPLE_GROUP)) + if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) goto done; - if (perf_event_raw(hw_event)) { + if (perf_event_raw(attr)) { pmu = hw_perf_counter_init(counter); goto done; } - switch (perf_event_type(hw_event)) { + switch (perf_event_type(attr)) { case PERF_TYPE_HARDWARE: pmu = hw_perf_counter_init(counter); break; @@ -3409,11 +3409,11 @@ done: counter->pmu = pmu; atomic_inc(&nr_counters); - if (counter->hw_event.mmap) + if (counter->attr.mmap) atomic_inc(&nr_mmap_tracking); - if (counter->hw_event.munmap) + if (counter->attr.munmap) atomic_inc(&nr_munmap_tracking); - if (counter->hw_event.comm) + if (counter->attr.comm) atomic_inc(&nr_comm_tracking); return counter; @@ -3424,17 +3424,17 @@ static atomic64_t perf_counter_id; /** * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * - * @hw_event_uptr: event type attributes for monitoring/sampling + * @attr_uptr: event type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu * @group_fd: group leader counter fd */ SYSCALL_DEFINE5(perf_counter_open, - const struct perf_counter_hw_event __user *, hw_event_uptr, + const struct perf_counter_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_counter *counter, *group_leader; - struct perf_counter_hw_event hw_event; + struct perf_counter_attr attr; struct perf_counter_context *ctx; struct file *counter_file = NULL; struct file *group_file = NULL; @@ -3446,7 +3446,7 @@ SYSCALL_DEFINE5(perf_counter_open, if (flags) return -EINVAL; - if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) + if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0) return -EFAULT; /* @@ -3484,11 +3484,11 @@ SYSCALL_DEFINE5(perf_counter_open, /* * Only a group leader can be exclusive or pinned */ - if (hw_event.exclusive || hw_event.pinned) + if (attr.exclusive || attr.pinned) goto err_put_context; } - counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader, + counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, GFP_KERNEL); ret = PTR_ERR(counter); if (IS_ERR(counter)) @@ -3556,7 +3556,7 @@ inherit_counter(struct perf_counter *parent_counter, if (parent_counter->parent) parent_counter = parent_counter->parent; - child_counter = perf_counter_alloc(&parent_counter->hw_event, + child_counter = perf_counter_alloc(&parent_counter->attr, parent_counter->cpu, child_ctx, group_leader, GFP_KERNEL); if (IS_ERR(child_counter)) @@ -3565,7 +3565,7 @@ inherit_counter(struct perf_counter *parent_counter, /* * Make the child state follow the state of the parent counter, - * not its hw_event.disabled bit. We hold the parent's mutex, + * not its attr.disabled bit. We hold the parent's mutex, * so we won't race with perf_counter_{en, dis}able_family. */ if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) @@ -3582,7 +3582,7 @@ inherit_counter(struct perf_counter *parent_counter, /* * inherit into child's child as well: */ - child_counter->hw_event.inherit = 1; + child_counter->attr.inherit = 1; /* * Get a reference to the parent filp - we will fput it @@ -3838,7 +3838,7 @@ int perf_counter_init_task(struct task_struct *child) if (counter != counter->group_leader) continue; - if (!counter->hw_event.inherit) { + if (!counter->attr.inherit) { inherited_all = 0; continue; } -- cgit v1.2.3 From f7e8b616ed1cc6f790b82324bce8a2a60295e5c2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Jun 2009 16:39:48 -0400 Subject: function-graph: move initialization of new tasks up in fork When the function graph tracer is enabled, all new tasks must allocate a ret_stack to place the return address of functions. This is because the function graph tracer will replace the real return address with a call to the tracing of the exit function. This initialization happens in fork, but it happens too late. If fork fails, then it will call free_task and that calls the freeing of this ret_stack. But before initialization happens, the new (failed) task points to its parents ret_stack. If a fork failure happens during the function trace, it would be catastrophic for the parent. Also, there's no need to call ftrace_graph_exit_task from fork, since it is called by free_task which fork calls on failure. [ Impact: prevent crash during failed fork running function graph tracer ] Signed-off-by: Steven Rostedt --- kernel/fork.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b9e2edd0072..c4b1e35c430 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -982,6 +982,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!p) goto fork_out; + ftrace_graph_init_task(p); + rt_mutex_init_task(p); #ifdef CONFIG_PROVE_LOCKING @@ -1131,8 +1133,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, } } - ftrace_graph_init_task(p); - p->pid = pid_nr(pid); p->tgid = p->pid; if (clone_flags & CLONE_THREAD) @@ -1141,7 +1141,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (current->nsproxy != p->nsproxy) { retval = ns_cgroup_clone(p, pid); if (retval) - goto bad_fork_free_graph; + goto bad_fork_free_pid; } p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; @@ -1233,7 +1233,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; - goto bad_fork_free_graph; + goto bad_fork_free_pid; } if (clone_flags & CLONE_THREAD) { @@ -1268,8 +1268,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, cgroup_post_fork(p); return p; -bad_fork_free_graph: - ftrace_graph_exit_task(p); bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); -- cgit v1.2.3 From 84047e360af0394ac5861d433f26bbcf30f77dd1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Jun 2009 16:51:55 -0400 Subject: function-graph: always initialize task ret_stack On creating a new task while running the function graph tracer, if we fail to allocate the ret_stack, and then fail the fork, the code will free the parent ret_stack. This is because the child duplicated the parent and currently points to the parent's ret_stack. This patch always initializes the task's ret_stack to NULL. [ Impact: prevent crash of parent on low memory during fork ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1664d3f33d3..bb081f37cac 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2738,6 +2738,9 @@ void unregister_ftrace_graph(void) /* Allocate a return stack for newly created task */ void ftrace_graph_init_task(struct task_struct *t) { + /* Make sure we do not use the parent ret_stack */ + t->ret_stack = NULL; + if (atomic_read(&ftrace_graph_active)) { struct ftrace_ret_stack *ret_stack; @@ -2753,8 +2756,7 @@ void ftrace_graph_init_task(struct task_struct *t) /* make curr_ret_stack visable before we add the ret_stack */ smp_wmb(); t->ret_stack = ret_stack; - } else - t->ret_stack = NULL; + } } void ftrace_graph_exit_task(struct task_struct *t) -- cgit v1.2.3 From 226f62fdd53d5b2c74e242aa11f6ad43d0285d3f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Jun 2009 11:23:56 +0200 Subject: perf_counter: Add a comm hook for pure fork()s I noticed missing COMM events and found that we missed reporting them for pure forks. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/fork.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 23bf757ed32..b7d7a9f0bd7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1412,6 +1412,12 @@ long do_fork(unsigned long clone_flags, if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); + } else { + /* + * vfork will do an exec which will call + * set_task_comm() + */ + perf_counter_comm(p); } audit_finish_fork(p); -- cgit v1.2.3 From a96bbc16418bc691317f265d6bf98ba941ca9c1a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Jun 2009 14:01:36 +0200 Subject: perf_counter: Fix race in counter initialization We need the PID namespace and counter ID available when the counter overflows and we need to generate a sample event. [ Impact: fix kernel crash with high-frequency sampling ] Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: [ fixed a further crash and cleaned up the initialization a bit ] Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 317cef78a38..ab4455447f8 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -48,6 +48,8 @@ int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */ +static atomic64_t perf_counter_id; + /* * Lock for (sysadmin-configurable) counter reservations: */ @@ -3351,14 +3353,18 @@ perf_counter_alloc(struct perf_counter_attr *attr, mutex_init(&counter->mmap_mutex); - counter->cpu = cpu; + counter->cpu = cpu; counter->attr = *attr; - counter->group_leader = group_leader; - counter->pmu = NULL; - counter->ctx = ctx; - counter->oncpu = -1; + counter->group_leader = group_leader; + counter->pmu = NULL; + counter->ctx = ctx; + counter->oncpu = -1; + + counter->ns = get_pid_ns(current->nsproxy->pid_ns); + counter->id = atomic64_inc_return(&perf_counter_id); + + counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->state = PERF_COUNTER_STATE_INACTIVE; if (attr->disabled) counter->state = PERF_COUNTER_STATE_OFF; @@ -3402,6 +3408,8 @@ done: err = PTR_ERR(pmu); if (err) { + if (counter->ns) + put_pid_ns(counter->ns); kfree(counter); return ERR_PTR(err); } @@ -3419,8 +3427,6 @@ done: return counter; } -static atomic64_t perf_counter_id; - /** * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * @@ -3515,9 +3521,6 @@ SYSCALL_DEFINE5(perf_counter_open, list_add_tail(&counter->owner_entry, ¤t->perf_counter_list); mutex_unlock(¤t->perf_counter_mutex); - counter->ns = get_pid_ns(current->nsproxy->pid_ns); - counter->id = atomic64_inc_return(&perf_counter_id); - fput_light(counter_file, fput_needed2); out_fput: -- cgit v1.2.3 From a2023556409cf7fec5d67a26f7fcfa57c5a4086d Mon Sep 17 00:00:00 2001 From: Tim Bird Date: Tue, 2 Jun 2009 17:06:54 -0700 Subject: ring-buffer: fix bug in ring_buffer_discard_commit There's a bug in ring_buffer_discard_commit. The wrong pointer is being compared in order to check if the event can be freed from the buffer rather than discarded (i.e. marked as PAD). I noticed this when I was working on duration filtering. The bug is not deadly - it just results in lots of wasted space in the buffer. All filtered events are left in the buffer and marked as discarded, rather than being removed from the buffer to make space for other events. Unfortunately, when I fixed this bug, I got errors doing a filtered function trace. Multiple TIME_EXTEND events pile up in the buffer, and trigger the following loop overage warning in rb_iter_peek(): again: ... if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) return NULL; I'm not sure what the best way is to fix this. I don't know if I should extend the loop threshhold, or if I should make the test more complex (ignore TIME_EXTEND events), or just get rid of this loop check completely. Note that if I implement a workaround for this, then I see another problem from rb_advance_iter(). I haven't tracked that one down yet. In general, it seems like the case of removing filtered events has not been working properly, and so some assumptions about buffer invariant conditions need to be revisited. Here's the patch for the simple fix: Compare correct pointer for checking if an event can be freed rather than left as discarded in the buffer. Signed-off-by: Tim Bird LKML-Reference: <4A25BE9E.5090909@am.sony.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 16b24d49604..94530236869 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1708,7 +1708,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, bpage = cpu_buffer->tail_page; - if (bpage == (void *)addr && rb_page_write(bpage) == old_index) { + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { /* * This is on the tail page. It is possible that * a write could come in and move the tail page -- cgit v1.2.3 From edd813bffc62a980bb4fb9b1243f31c1cce78da3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Jun 2009 23:00:53 -0400 Subject: ring-buffer: try to discard unneeded timestamps There are times that a race may happen that we add a timestamp in a nested write. This timestamp would just contain a zero delta and serves no purpose. Now that we have a way to discard events, this patch will try to discard the timestamp instead of just wasting the space in the ring buffer. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 67 ++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 94530236869..50926601a28 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1335,6 +1335,38 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; } +static inline int +rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long new_index, old_index; + struct buffer_page *bpage; + unsigned long index; + unsigned long addr; + + new_index = rb_event_index(event); + old_index = new_index + rb_event_length(event); + addr = (unsigned long)event; + addr &= PAGE_MASK; + + bpage = cpu_buffer->tail_page; + + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { + /* + * This is on the tail page. It is possible that + * a write could come in and move the tail page + * and write to the next page. That is fine + * because we just shorten what is on this page. + */ + index = local_cmpxchg(&bpage->write, old_index, new_index); + if (index == old_index) + return 1; + } + + /* could not discard */ + return 0; +} + static int rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, u64 *delta) @@ -1384,10 +1416,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, /* let the caller know this was the commit */ ret = 1; } else { - /* Darn, this is just wasted space */ - event->time_delta = 0; - event->array[0] = 0; - ret = 0; + /* Try to discard the event */ + if (!rb_try_to_discard(cpu_buffer, event)) { + /* Darn, this is just wasted space */ + event->time_delta = 0; + event->array[0] = 0; + ret = 0; + } } *delta = 0; @@ -1682,10 +1717,6 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long new_index, old_index; - struct buffer_page *bpage; - unsigned long index; - unsigned long addr; int cpu; /* The event is discarded regardless */ @@ -1701,24 +1732,8 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, cpu = smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; - new_index = rb_event_index(event); - old_index = new_index + rb_event_length(event); - addr = (unsigned long)event; - addr &= PAGE_MASK; - - bpage = cpu_buffer->tail_page; - - if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { - /* - * This is on the tail page. It is possible that - * a write could come in and move the tail page - * and write to the next page. That is fine - * because we just shorten what is on this page. - */ - index = local_cmpxchg(&bpage->write, old_index, new_index); - if (index == old_index) - goto out; - } + if (!rb_try_to_discard(cpu_buffer, event)) + goto out; /* * The commit is still visible by the reader, so we -- cgit v1.2.3 From ea05b57cc19234d8de9887c8a32c2e58e84b56ba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Jun 2009 09:30:10 -0400 Subject: ring-buffer: discard timestamps that are at the start of the buffer Every buffer page in the ring buffer includes its own time stamp. When an event is recorded to the ring buffer with a delta time greater than what can be held in the event header, a time stamp event is created. If the the create timestamp falls over to the next buffer page, it is redundant because the buffer page holds a full time stamp. This patch will try to discard the time stamp when it falls to the start of the next page. This change also fixes a issues with disarding events. If most events are discarded, timestamps will start to creep into the ring buffer. If we do not discard the timestamps then they can fill up the ring buffer over time and waste space. This change will keep time stamps from filling up over another page. If something is recorded in the buffer page, and the rest is filtered, then the time stamps can only fill up to the end of the page. [ Impact: prevent time stamps from filling ring buffer ] Reported-by: Tim Bird Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 50926601a28..7102d7a2fad 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -370,6 +370,9 @@ static inline int test_time_stamp(u64 delta) /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) +/* Max number of timestamps that can fit on a page */ +#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP) + int ring_buffer_print_page_header(struct trace_seq *s) { struct buffer_data_page field; @@ -1409,8 +1412,12 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, event->array[0] = *delta >> TS_SHIFT; } else { cpu_buffer->commit_page->page->time_stamp = *ts; - event->time_delta = 0; - event->array[0] = 0; + /* try to discard, since we do not need this */ + if (!rb_try_to_discard(cpu_buffer, event)) { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } } cpu_buffer->write_stamp = *ts; /* let the caller know this was the commit */ @@ -2268,8 +2275,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * Check if we are at the end of the buffer. */ if (iter->head >= rb_page_size(iter->head_page)) { - if (RB_WARN_ON(buffer, - iter->head_page == cpu_buffer->commit_page)) + /* discarded commits can make the page empty */ + if (iter->head_page == cpu_buffer->commit_page) return; rb_inc_iter(iter); return; @@ -2312,12 +2319,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) /* * We repeat when a timestamp is encountered. It is possible * to get multiple timestamps from an interrupt entering just - * as one timestamp is about to be written. The max times - * that this can happen is the number of nested interrupts we - * can have. Nesting 10 deep of interrupts is clearly - * an anomaly. + * as one timestamp is about to be written, or from discarded + * commits. The most that we can have is the number on a single page. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) return NULL; reader = rb_get_reader_page(cpu_buffer); @@ -2383,14 +2388,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) again: /* - * We repeat when a timestamp is encountered. It is possible - * to get multiple timestamps from an interrupt entering just - * as one timestamp is about to be written. The max times - * that this can happen is the number of nested interrupts we - * can have. Nesting 10 deep of interrupts is clearly - * an anomaly. + * We repeat when a timestamp is encountered. + * We can get multiple timestamps by nested interrupts or also + * if filtering is on (discarding commits). Since discarding + * commits can be frequent we can get a lot of timestamps. + * But we limit them by not adding timestamps if they begin + * at the start of a page. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) return NULL; if (rb_per_cpu_empty(cpu_buffer)) -- cgit v1.2.3 From 083a63b48e4dd0a6a2d44216720076dc81ebb255 Mon Sep 17 00:00:00 2001 From: walimis Date: Wed, 3 Jun 2009 16:01:28 +0800 Subject: tracing/trace_stack: fix the number of entries in the header The last entry in the stack_dump_trace is ULONG_MAX, which is not a valid entry, but max_stack_trace.nr_entries has accounted for it. So when printing the header, we should decrease it by one. Before fix, print as following, for example: Depth Size Location (53 entries) <--- should be 52 ----- ---- -------- 0) 3264 108 update_wall_time+0x4d5/0x9a0 ... 51) 80 80 syscall_call+0x7/0xb ^^^ it's correct. Signed-off-by: walimis LKML-Reference: <1244016090-7814-1-git-send-email-walimisdev@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 1796f00524e..2d7aebd71db 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -265,7 +265,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries); + max_stack_trace.nr_entries - 1); if (!stack_tracer_enabled && !max_stack_size) print_disabled(m); -- cgit v1.2.3 From f11b3f4e2932bfdcfc458ab8d1ece62724ceabfc Mon Sep 17 00:00:00 2001 From: walimis Date: Wed, 3 Jun 2009 16:01:29 +0800 Subject: tracing/events: fix output format of kernel stack According to "events/ftrace/kernel_stack/format", output format of kernel stack should use "=>" instead of "<=". The second problem is that we shouldn't skip the first entry in the stack, although it seems to be duplicated when used in the "function" tracer, but events also use it. If we skip the first one, we will drop the topmost entry of the stack. The last problem is that if the last entry is ULONG_MAX(0xffffffff), we should drop it, otherwise it will print a NULL name line. before fix: sh-1072 [000] 26.957239: sched_process_fork: parent sh:1072 child sh:1073 sh-1072 [000] 26.957262: <= syscall_call <= sh-1072 [000] 26.957744: sched_switch: task sh:1072 [120] (R) ==> sh:1073 [120] sh-1072 [000] 26.957752: <= preempt_schedule <= wake_up_new_task <= do_fork <= sys_clone <= syscall_call <= After fix: sh-1075 [000] 39.791848: sched_process_fork: parent sh:1075 child sh:1076 sh-1075 [000] 39.791871: => sys_clone => syscall_call sh-1075 [000] 39.792713: sched_switch: task sh:1075 [120] (R) ==> sh:1076 [120] sh-1075 [000] 39.792722: => schedule => preempt_schedule => wake_up_new_task => do_fork => sys_clone => syscall_call Signed-off-by: walimis LKML-Reference: <1244016090-7814-2-git-send-email-walimisdev@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0fe3b223f7e..64596a57160 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -975,16 +975,16 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); + if (!trace_seq_puts(s, "\n")) + goto partial; for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - if (!field->caller[i]) + if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) break; - if (i) { - if (!trace_seq_puts(s, " <= ")) - goto partial; + if (!trace_seq_puts(s, " => ")) + goto partial; - if (!seq_print_ip_sym(s, field->caller[i], flags)) - goto partial; - } + if (!seq_print_ip_sym(s, field->caller[i], flags)) + goto partial; if (!trace_seq_puts(s, "\n")) goto partial; } -- cgit v1.2.3 From 048dc50c5e7eada19ebabbad70b7966d14283d41 Mon Sep 17 00:00:00 2001 From: walimis Date: Wed, 3 Jun 2009 16:01:30 +0800 Subject: tracing/events: fix output format of user stack According to "events/ftrace/user_stack/format", fix the output of user stack. before fix: sh-1073 [000] 31.137561: <- <0804e33c> <- <080835c1> after fix: sh-1072 [000] 37.039329: => => <0804e33c> => <080835c1> Signed-off-by: walimis LKML-Reference: <1244016090-7814-3-git-send-email-walimisdev@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 64596a57160..8dadbbbd2d5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -389,17 +389,20 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, if (ip == ULONG_MAX || !ret) break; - if (i && ret) - ret = trace_seq_puts(s, " <- "); + if (ret) + ret = trace_seq_puts(s, " => "); if (!ip) { if (ret) ret = trace_seq_puts(s, "??"); + if (ret) + ret = trace_seq_puts(s, "\n"); continue; } if (!ret) break; if (ret) ret = seq_print_user_ip(s, mm, ip, sym_flags); + ret = trace_seq_puts(s, "\n"); } if (mm) @@ -1012,10 +1015,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (!seq_print_userip_objs(field, s, flags)) + if (!trace_seq_putc(s, '\n')) goto partial; - if (!trace_seq_putc(s, '\n')) + if (!seq_print_userip_objs(field, s, flags)) goto partial; return TRACE_TYPE_HANDLED; -- cgit v1.2.3 From 56d8bd3f0b98972312cad683947ec90b21011199 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 3 Jun 2009 14:52:03 +0100 Subject: tracing: fix multiple use of __print_flags and __print_symbolic Here is an updated patch to include the extra call to trace_seq_init() as requested. This is vs. the latest -tip tree and fixes the use of multiple __print_flags and __print_symbolic in a single tracer. Also tested to ensure its working now: mount.gfs2-2534 [000] 235.850587: gfs2_glock_queue: 8.7 glock 1:2 dequeue PR mount.gfs2-2534 [000] 235.850591: gfs2_demote_rq: 8.7 glock 1:0 demote EX to NL flags:DI mount.gfs2-2534 [000] 235.850591: gfs2_glock_queue: 8.7 glock 1:0 dequeue EX glock_workqueue-2529 [000] 235.850666: gfs2_glock_state_change: 8.7 glock 1:0 state EX => NL tgt:NL dmt:NL flags:lDpI glock_workqueue-2529 [000] 235.850672: gfs2_glock_put: 8.7 glock 1:0 state NL => IV flags:I Signed-off-by: Steven Whitehouse LKML-Reference: <1244037123.29604.603.camel@localhost.localdomain> Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8dadbbbd2d5..8afeea412e7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -223,10 +223,9 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, { unsigned long mask; const char *str; + const char *ret = p->buffer + p->len; int i; - trace_seq_init(p); - for (i = 0; flag_array[i].name && flags; i++) { mask = flag_array[i].mask; @@ -249,7 +248,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, trace_seq_putc(p, 0); - return p->buffer; + return ret; } EXPORT_SYMBOL(ftrace_print_flags_seq); @@ -258,8 +257,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, const struct trace_print_flags *symbol_array) { int i; - - trace_seq_init(p); + const char *ret = p->buffer + p->len; for (i = 0; symbol_array[i].name; i++) { @@ -275,7 +273,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, trace_seq_putc(p, 0); - return p->buffer; + return ret; } EXPORT_SYMBOL(ftrace_print_symbols_seq); -- cgit v1.2.3 From 563af16c30ede41eda2d614195d88e07f7c7103d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Jun 2009 11:10:44 -0400 Subject: tracing: add annotation to what type of stack trace is recorded The current method of printing out a stack trace is to add a new line and print out the trace: yum-updatesd-3120 [002] 573.691303: => do_softirq => irq_exit => smp_apic_timer_interrupt => apic_timer_interrupt This looks a bit awkward, and if we have both stack and user stack traces running, it would be nice to have a title to tell them apart, although it is easy to tell by the output. This patch adds an annotation to the start of the stack traces: init-1 [003] 929.304979: => user_path_at => vfs_fstatat => vfs_stat => sys_newstat => system_call_fastpath cat-3459 [002] 1016.824040: => <0000003aae6c0250> => <00007ffff4b06ae4> => <69636172742f6775> Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8afeea412e7..425725c1622 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -976,7 +976,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (!trace_seq_puts(s, "\n")) + if (!trace_seq_puts(s, "\n")) goto partial; for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) @@ -1013,7 +1013,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (!trace_seq_putc(s, '\n')) + if (!trace_seq_puts(s, "\n")) goto partial; if (!seq_print_userip_objs(field, s, flags)) -- cgit v1.2.3 From 128f048f0f0d2a477ad2555e7acd2ad15a1b6061 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2009 22:19:36 +0200 Subject: perf_counter: Fix throttling lock-up Throttling logic is broken and we can lock up with too small hw sampling intervals. Make the throttling code more robust: disable counters even if we already disabled them. ( Also clean up whitespace damage i noticed while reading various pieces of code related to throttling. ) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ab4455447f8..0bb03f15a5b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2822,11 +2822,20 @@ int perf_counter_overflow(struct perf_counter *counter, if (!throttle) { counter->hw.interrupts++; - } else if (counter->hw.interrupts != MAX_INTERRUPTS) { - counter->hw.interrupts++; - if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) { - counter->hw.interrupts = MAX_INTERRUPTS; - perf_log_throttle(counter, 0); + } else { + if (counter->hw.interrupts != MAX_INTERRUPTS) { + counter->hw.interrupts++; + if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) { + counter->hw.interrupts = MAX_INTERRUPTS; + perf_log_throttle(counter, 0); + ret = 1; + } + } else { + /* + * Keep re-disabling counters even though on the previous + * pass we disabled it - just in case we raced with a + * sched-in and the counter got enabled again: + */ ret = 1; } } -- cgit v1.2.3 From e0a94c2a63f2644826069044649669b5e7ca75d3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 3 Jun 2009 16:04:31 -0400 Subject: security: use mmap_min_addr indepedently of security models This patch removes the dependency of mmap_min_addr on CONFIG_SECURITY. It also sets a default mmap_min_addr of 4096. mmapping of addresses below 4096 will only be possible for processes with CAP_SYS_RAWIO. Signed-off-by: Christoph Lameter Acked-by: Eric Paris Looks-ok-by: Linus Torvalds Signed-off-by: James Morris --- kernel/sysctl.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 149581fb48a..45bd711a242 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1237,7 +1237,6 @@ static struct ctl_table vm_table[] = { .strategy = &sysctl_jiffies, }, #endif -#ifdef CONFIG_SECURITY { .ctl_name = CTL_UNNUMBERED, .procname = "mmap_min_addr", @@ -1246,7 +1245,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_doulongvec_minmax, }, -#endif #ifdef CONFIG_NUMA { .ctl_name = CTL_UNNUMBERED, -- cgit v1.2.3 From 60313ebed739b331e8e61079da27a11ee3b73a30 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Jun 2009 16:53:44 +0200 Subject: perf_counter: Add fork event Create a fork event so that we can easily clone the comm and dso maps without having to generate all those events. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/fork.c | 4 +- kernel/perf_counter.c | 131 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 116 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b7d7a9f0bd7..f4466ca37ec 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1412,12 +1412,12 @@ long do_fork(unsigned long clone_flags, if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); - } else { + } else if (!(clone_flags & CLONE_VM)) { /* * vfork will do an exec which will call * set_task_comm() */ - perf_counter_comm(p); + perf_counter_fork(p); } audit_finish_fork(p); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0bb03f15a5b..78c58623a0d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -40,9 +40,9 @@ static int perf_reserved_percpu __read_mostly; static int perf_overcommit __read_mostly = 1; static atomic_t nr_counters __read_mostly; -static atomic_t nr_mmap_tracking __read_mostly; -static atomic_t nr_munmap_tracking __read_mostly; -static atomic_t nr_comm_tracking __read_mostly; +static atomic_t nr_mmap_counters __read_mostly; +static atomic_t nr_munmap_counters __read_mostly; +static atomic_t nr_comm_counters __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ @@ -1447,11 +1447,11 @@ static void free_counter(struct perf_counter *counter) atomic_dec(&nr_counters); if (counter->attr.mmap) - atomic_dec(&nr_mmap_tracking); + atomic_dec(&nr_mmap_counters); if (counter->attr.munmap) - atomic_dec(&nr_munmap_tracking); + atomic_dec(&nr_munmap_counters); if (counter->attr.comm) - atomic_dec(&nr_comm_tracking); + atomic_dec(&nr_comm_counters); if (counter->destroy) counter->destroy(counter); @@ -2475,6 +2475,105 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_end(&handle); } +/* + * fork tracking + */ + +struct perf_fork_event { + struct task_struct *task; + + struct { + struct perf_event_header header; + + u32 pid; + u32 ppid; + } event; +}; + +static void perf_counter_fork_output(struct perf_counter *counter, + struct perf_fork_event *fork_event) +{ + struct perf_output_handle handle; + int size = fork_event->event.header.size; + struct task_struct *task = fork_event->task; + int ret = perf_output_begin(&handle, counter, size, 0, 0); + + if (ret) + return; + + fork_event->event.pid = perf_counter_pid(counter, task); + fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); + + perf_output_put(&handle, fork_event->event); + perf_output_end(&handle); +} + +static int perf_counter_fork_match(struct perf_counter *counter) +{ + if (counter->attr.comm || counter->attr.mmap || counter->attr.munmap) + return 1; + + return 0; +} + +static void perf_counter_fork_ctx(struct perf_counter_context *ctx, + struct perf_fork_event *fork_event) +{ + struct perf_counter *counter; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_fork_match(counter)) + perf_counter_fork_output(counter, fork_event); + } + rcu_read_unlock(); +} + +static void perf_counter_fork_event(struct perf_fork_event *fork_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_counter_context *ctx; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_counter_fork_ctx(&cpuctx->ctx, fork_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_counter_ctxp); + if (ctx) + perf_counter_fork_ctx(ctx, fork_event); + rcu_read_unlock(); +} + +void perf_counter_fork(struct task_struct *task) +{ + struct perf_fork_event fork_event; + + if (!atomic_read(&nr_comm_counters) && + !atomic_read(&nr_mmap_counters) && + !atomic_read(&nr_munmap_counters)) + return; + + fork_event = (struct perf_fork_event){ + .task = task, + .event = { + .header = { + .type = PERF_EVENT_FORK, + .size = sizeof(fork_event.event), + }, + }, + }; + + perf_counter_fork_event(&fork_event); +} + /* * comm tracking */ @@ -2511,11 +2610,9 @@ static void perf_counter_comm_output(struct perf_counter *counter, perf_output_end(&handle); } -static int perf_counter_comm_match(struct perf_counter *counter, - struct perf_comm_event *comm_event) +static int perf_counter_comm_match(struct perf_counter *counter) { - if (counter->attr.comm && - comm_event->event.header.type == PERF_EVENT_COMM) + if (counter->attr.comm) return 1; return 0; @@ -2531,7 +2628,7 @@ static void perf_counter_comm_ctx(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_comm_match(counter, comm_event)) + if (perf_counter_comm_match(counter)) perf_counter_comm_output(counter, comm_event); } rcu_read_unlock(); @@ -2570,7 +2667,7 @@ void perf_counter_comm(struct task_struct *task) { struct perf_comm_event comm_event; - if (!atomic_read(&nr_comm_tracking)) + if (!atomic_read(&nr_comm_counters)) return; comm_event = (struct perf_comm_event){ @@ -2708,7 +2805,7 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, { struct perf_mmap_event mmap_event; - if (!atomic_read(&nr_mmap_tracking)) + if (!atomic_read(&nr_mmap_counters)) return; mmap_event = (struct perf_mmap_event){ @@ -2729,7 +2826,7 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, { struct perf_mmap_event mmap_event; - if (!atomic_read(&nr_munmap_tracking)) + if (!atomic_read(&nr_munmap_counters)) return; mmap_event = (struct perf_mmap_event){ @@ -3427,11 +3524,11 @@ done: atomic_inc(&nr_counters); if (counter->attr.mmap) - atomic_inc(&nr_mmap_tracking); + atomic_inc(&nr_mmap_counters); if (counter->attr.munmap) - atomic_inc(&nr_munmap_tracking); + atomic_inc(&nr_munmap_counters); if (counter->attr.comm) - atomic_inc(&nr_comm_tracking); + atomic_inc(&nr_comm_counters); return counter; } -- cgit v1.2.3 From d99e9446200c1ffab28cb0e39b76c34a2bfafd06 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Jun 2009 17:08:58 +0200 Subject: perf_counter: Remove munmap stuff In name of keeping it simple, only track mmap events. Userspace will have to remove old overlapping maps when it encounters them. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 38 +++----------------------------------- 1 file changed, 3 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 78c58623a0d..195712e20d0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -41,7 +41,6 @@ static int perf_overcommit __read_mostly = 1; static atomic_t nr_counters __read_mostly; static atomic_t nr_mmap_counters __read_mostly; -static atomic_t nr_munmap_counters __read_mostly; static atomic_t nr_comm_counters __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ @@ -1448,8 +1447,6 @@ static void free_counter(struct perf_counter *counter) atomic_dec(&nr_counters); if (counter->attr.mmap) atomic_dec(&nr_mmap_counters); - if (counter->attr.munmap) - atomic_dec(&nr_munmap_counters); if (counter->attr.comm) atomic_dec(&nr_comm_counters); @@ -2510,7 +2507,7 @@ static void perf_counter_fork_output(struct perf_counter *counter, static int perf_counter_fork_match(struct perf_counter *counter) { - if (counter->attr.comm || counter->attr.mmap || counter->attr.munmap) + if (counter->attr.comm || counter->attr.mmap) return 1; return 0; @@ -2557,8 +2554,7 @@ void perf_counter_fork(struct task_struct *task) struct perf_fork_event fork_event; if (!atomic_read(&nr_comm_counters) && - !atomic_read(&nr_mmap_counters) && - !atomic_read(&nr_munmap_counters)) + !atomic_read(&nr_mmap_counters)) return; fork_event = (struct perf_fork_event){ @@ -2722,12 +2718,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter, static int perf_counter_mmap_match(struct perf_counter *counter, struct perf_mmap_event *mmap_event) { - if (counter->attr.mmap && - mmap_event->event.header.type == PERF_EVENT_MMAP) - return 1; - - if (counter->attr.munmap && - mmap_event->event.header.type == PERF_EVENT_MUNMAP) + if (counter->attr.mmap) return 1; return 0; @@ -2821,27 +2812,6 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, perf_counter_mmap_event(&mmap_event); } -void perf_counter_munmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) -{ - struct perf_mmap_event mmap_event; - - if (!atomic_read(&nr_munmap_counters)) - return; - - mmap_event = (struct perf_mmap_event){ - .file = file, - .event = { - .header = { .type = PERF_EVENT_MUNMAP, }, - .start = addr, - .len = len, - .pgoff = pgoff, - }, - }; - - perf_counter_mmap_event(&mmap_event); -} - /* * Log sample_period changes so that analyzing tools can re-normalize the * event flow. @@ -3525,8 +3495,6 @@ done: atomic_inc(&nr_counters); if (counter->attr.mmap) atomic_inc(&nr_mmap_counters); - if (counter->attr.munmap) - atomic_inc(&nr_munmap_counters); if (counter->attr.comm) atomic_inc(&nr_comm_counters); -- cgit v1.2.3 From 087eb437051b3de817720f9c80c440fc9e7dcce8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 4 Jun 2009 16:29:07 -0700 Subject: ptrace: tracehook_report_clone: fix false positives The "trace || CLONE_PTRACE" check in tracehook_report_clone() is not right, - If the untraced task does clone(CLONE_PTRACE) the new child is not traced, we must not queue SIGSTOP. - If we forked the traced task, but the tracer exits and untraces both the forking task and the new child (after copy_process() drops tasklist_lock), we should not queue SIGSTOP too. Change the code to check task_ptrace() != 0 instead. This is still racy, but the race is harmless. We can race with another tracer attaching to this child, or the tracer can exit and detach in parallel. But giwen that we didn't do wake_up_new_task() yet, the child must have the pending SIGSTOP anyway. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Christoph Hellwig Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b9e2edd0072..875ffbdd96d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1409,7 +1409,7 @@ long do_fork(unsigned long clone_flags, } audit_finish_fork(p); - tracehook_report_clone(trace, regs, clone_flags, nr, p); + tracehook_report_clone(regs, clone_flags, nr, p); /* * We set PF_STARTING at creation in case tracing wants to -- cgit v1.2.3 From edaba2c5334492f82d39ec35637c6dea5176a977 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 4 Jun 2009 16:29:09 -0700 Subject: ptrace: revert "ptrace_detach: the wrong wakeup breaks the ERESTARTxxx logic" Commit 95a3540da9c81a5987be810e1d9a83640a366bd5 ("ptrace_detach: the wrong wakeup breaks the ERESTARTxxx logic") removed the "extra" wake_up_process() from ptrace_detach(), but as Jan pointed out this breaks the compatibility. I believe the changelog is right and this wake_up() is wrong in many ways, but GDB assumes that ptrace(PTRACE_DETACH, child, 0, 0) always wakes up the tracee. Despite the fact this breaks SIGNAL_STOP_STOPPED/group_stop_count logic, and despite the fact this wake_up_process() can break another assumption: PTRACE_DETACH with SIGSTOP should leave the tracee in TASK_STOPPED case. Because the untraced child can dequeue SIGSTOP and call do_signal_stop() before ptrace_detach() calls wake_up_process(). Revert this change for now. We need some fixes even if we we want to keep the current behaviour, but these fixes are not for 2.6.30. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Jan Kratochvil Cc: Denys Vlasenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0692ab5a0d6..42c317874cf 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -304,6 +304,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data) if (child->ptrace) { child->exit_code = data; dead = __ptrace_detach(current, child); + if (!child->exit_state) + wake_up_process(child); } write_unlock_irq(&tasklist_lock); -- cgit v1.2.3 From 6dc5f2a41759987e35e757ef00192e7b424563bb Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 5 Jun 2009 12:36:28 +1000 Subject: perf_counter: Fix lockup with interrupting counters Commit 8e3747c1 ("perf_counter: Change data head from u32 to u64") changed the type of 'head' in struct perf_mmap_data from atomic_t to atomic_long_t, but missed converting one use of atomic_read on it to atomic_long_read. The effect of using atomic_read rather than atomic_long_read on powerpc (and other big-endian architectures) is that we get the high half of the 64-bit quantity, resulting in the cmpxchg retry loop in perf_output_begin spinning forever as soon as data->head becomes non-zero. On little-endian architectures such as x86 we would get the low half, resulting in a lockup once data->head becomes greater than 4G. This fixes it by using atomic_long_read rather than atomic_read. [ Impact: fix perfcounter lockup on PowerPC / big-endian systems ] Signed-off-by: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <18984.33964.21541.743096@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 195712e20d0..a5d3e2aedd2 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2234,7 +2234,7 @@ static int perf_output_begin(struct perf_output_handle *handle, perf_output_lock(handle); do { - offset = head = atomic_read(&data->head); + offset = head = atomic_long_read(&data->head); head += size; } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); -- cgit v1.2.3 From 089dd79db9264dc0da602bad45d42f1b3e7d1e07 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2009 14:04:55 +0200 Subject: perf_counter: Generate mmap events for install_special_mapping() In order to track the vdso also generate mmap events for install_special_mapping(). Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index a5d3e2aedd2..37a5a241ca7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2255,7 +2255,7 @@ out: } static void perf_output_copy(struct perf_output_handle *handle, - void *buf, unsigned int len) + const void *buf, unsigned int len) { unsigned int pages_mask; unsigned int offset; @@ -2681,9 +2681,10 @@ void perf_counter_comm(struct task_struct *task) */ struct perf_mmap_event { - struct file *file; - char *file_name; - int file_size; + struct vm_area_struct *vma; + + const char *file_name; + int file_size; struct { struct perf_event_header header; @@ -2744,11 +2745,12 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; - struct file *file = mmap_event->file; + struct vm_area_struct *vma = mmap_event->vma; + struct file *file = vma->vm_file; unsigned int size; char tmp[16]; char *buf = NULL; - char *name; + const char *name; if (file) { buf = kzalloc(PATH_MAX, GFP_KERNEL); @@ -2762,6 +2764,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) goto got_name; } } else { + name = arch_vma_name(mmap_event->vma); + if (name) + goto got_name; + + if (!vma->vm_mm) { + name = strncpy(tmp, "[vdso]", sizeof(tmp)); + goto got_name; + } + name = strncpy(tmp, "//anon", sizeof(tmp)); goto got_name; } @@ -2791,8 +2802,7 @@ got_name: kfree(buf); } -void perf_counter_mmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) +void __perf_counter_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; @@ -2800,12 +2810,12 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, return; mmap_event = (struct perf_mmap_event){ - .file = file, + .vma = vma, .event = { .header = { .type = PERF_EVENT_MMAP, }, - .start = addr, - .len = len, - .pgoff = pgoff, + .start = vma->vm_start, + .len = vma->vm_end - vma->vm_start, + .pgoff = vma->vm_pgoff, }, }; -- cgit v1.2.3 From ac4bcf889469ffbca88f234d3184452886a47905 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2009 14:44:52 +0200 Subject: perf_counter: Change PERF_SAMPLE_CONFIG into PERF_SAMPLE_ID The purpose of PERF_SAMPLE_CONFIG was to identify the counters, since then we've added counter ids, use those instead. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 37a5a241ca7..e75b91a76a5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2392,8 +2392,8 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } - if (sample_type & PERF_SAMPLE_CONFIG) { - header.type |= PERF_SAMPLE_CONFIG; + if (sample_type & PERF_SAMPLE_ID) { + header.type |= PERF_SAMPLE_ID; header.size += sizeof(u64); } @@ -2439,8 +2439,8 @@ static void perf_counter_output(struct perf_counter *counter, if (sample_type & PERF_SAMPLE_ADDR) perf_output_put(&handle, addr); - if (sample_type & PERF_SAMPLE_CONFIG) - perf_output_put(&handle, counter->attr.config); + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(&handle, counter->id); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(&handle, cpu_entry); -- cgit v1.2.3 From 689802b2d0536e72281dc959ab9cb34fb3c304cf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2009 15:05:43 +0200 Subject: perf_counter: Add PERF_SAMPLE_PERIOD In order to allow easy tracking of the period, also provide means of adding it to the sample data. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e75b91a76a5..f8390668c39 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2404,6 +2404,11 @@ static void perf_counter_output(struct perf_counter *counter, cpu_entry.cpu = raw_smp_processor_id(); } + if (sample_type & PERF_SAMPLE_PERIOD) { + header.type |= PERF_SAMPLE_PERIOD; + header.size += sizeof(u64); + } + if (sample_type & PERF_SAMPLE_GROUP) { header.type |= PERF_SAMPLE_GROUP; header.size += sizeof(u64) + @@ -2445,6 +2450,9 @@ static void perf_counter_output(struct perf_counter *counter, if (sample_type & PERF_SAMPLE_CPU) perf_output_put(&handle, cpu_entry); + if (sample_type & PERF_SAMPLE_PERIOD) + perf_output_put(&handle, counter->hw.sample_period); + /* * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. */ @@ -2835,6 +2843,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period) struct { struct perf_event_header header; u64 time; + u64 id; u64 period; } freq_event = { .header = { @@ -2843,6 +2852,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period) .size = sizeof(freq_event), }, .time = sched_clock(), + .id = counter->id, .period = period, }; -- cgit v1.2.3 From 6a24ed6c6082ec65d19331a4bfa30c0512a1a822 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2009 18:01:29 +0200 Subject: perf_counter: Fix frequency adjustment for < HZ Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f8390668c39..47c92fb927f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1187,8 +1187,9 @@ static void perf_log_period(struct perf_counter *counter, u64 period); static void perf_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; + struct hw_perf_counter *hwc; u64 interrupts, sample_period; - u64 events, period; + u64 events, period, freq; s64 delta; spin_lock(&ctx->lock); @@ -1196,8 +1197,10 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) if (counter->state != PERF_COUNTER_STATE_ACTIVE) continue; - interrupts = counter->hw.interrupts; - counter->hw.interrupts = 0; + hwc = &counter->hw; + + interrupts = hwc->interrupts; + hwc->interrupts = 0; if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(counter, 1); @@ -1208,20 +1211,35 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) if (!counter->attr.freq || !counter->attr.sample_freq) continue; - events = HZ * interrupts * counter->hw.sample_period; + if (counter->attr.sample_freq < HZ) { + freq = counter->attr.sample_freq; + + hwc->freq_count += freq; + hwc->freq_interrupts += interrupts; + + if (hwc->freq_count < HZ) + continue; + + interrupts = hwc->freq_interrupts; + hwc->freq_interrupts = 0; + hwc->freq_count -= HZ; + } else + freq = HZ; + + events = freq * interrupts * hwc->sample_period; period = div64_u64(events, counter->attr.sample_freq); - delta = (s64)(1 + period - counter->hw.sample_period); + delta = (s64)(1 + period - hwc->sample_period); delta >>= 1; - sample_period = counter->hw.sample_period + delta; + sample_period = hwc->sample_period + delta; if (!sample_period) sample_period = 1; perf_log_period(counter, sample_period); - counter->hw.sample_period = sample_period; + hwc->sample_period = sample_period; } spin_unlock(&ctx->lock); } -- cgit v1.2.3 From a21ca2cac582886a3e95c8bb84ff7c52d4d15e54 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 6 Jun 2009 09:58:57 +0200 Subject: perf_counter: Separate out attr->type from attr->config Counter type is a frequently used value and we do a lot of bit juggling by encoding and decoding it from attr->config. Clean this up by creating a separate attr->type field. Also clean up the various similarly complex user-space bits all around counter attribute management. The net improvement is significant, and it will be easier to add a new major type (which is what triggered this cleanup). (This changes the ABI, all tools are adapted.) (PowerPC build-tested.) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 47c92fb927f..75ae76796df 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3091,14 +3091,12 @@ static int perf_swcounter_match(struct perf_counter *counter, enum perf_event_types type, u32 event, struct pt_regs *regs) { - u64 event_config; - - event_config = ((u64) type << PERF_COUNTER_TYPE_SHIFT) | event; - if (!perf_swcounter_is_counting(counter)) return 0; - if (counter->attr.config != event_config) + if (counter->attr.type != type) + return 0; + if (counter->attr.config != event) return 0; if (regs) { @@ -3403,7 +3401,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (perf_event_id(&counter->attr)) { + switch (counter->attr.config) { case PERF_COUNT_CPU_CLOCK: pmu = &perf_ops_cpu_clock; @@ -3496,12 +3494,12 @@ perf_counter_alloc(struct perf_counter_attr *attr, if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) goto done; - if (perf_event_raw(attr)) { + if (attr->type == PERF_TYPE_RAW) { pmu = hw_perf_counter_init(counter); goto done; } - switch (perf_event_type(attr)) { + switch (attr->type) { case PERF_TYPE_HARDWARE: pmu = hw_perf_counter_init(counter); break; -- cgit v1.2.3 From 8326f44da090d6d304d29b9fdc7fb3e20889e329 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 5 Jun 2009 20:22:46 +0200 Subject: perf_counter: Implement generalized cache event types Extend generic event enumeration with the PERF_TYPE_HW_CACHE method. This is a 3-dimensional space: { L1-D, L1-I, L2, ITLB, DTLB, BPU } x { load, store, prefetch } x { accesses, misses } User-space passes in the 3 coordinates and the kernel provides a counter. (if the hardware supports that type and if the combination makes sense.) Combinations that make no sense produce a -EINVAL. Combinations that are not supported by the hardware produce -ENOTSUP. Extend the tools to deal with this, and rewrite the event symbol parsing code with various popular aliases for the units and access methods above. So 'l1-cache-miss' and 'l1d-read-ops' are both valid aliases. ( x86 is supported for now, with the Nehalem event table filled in, and with Core2 and Atom having placeholder tables. ) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 75ae76796df..5eacaaf3f9c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3501,6 +3501,7 @@ perf_counter_alloc(struct perf_counter_attr *attr, switch (attr->type) { case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: pmu = hw_perf_counter_init(counter); break; -- cgit v1.2.3 From 3af968e066d593bc4dacc021715f3e95ddf0996f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 8 Jun 2009 12:31:53 -0700 Subject: async: Fix lack of boot-time console due to insufficient synchronization Our async work synchronization was broken by "async: make sure independent async domains can't accidentally entangle" (commit d5a877e8dd409d8c702986d06485c374b705d340), because it would report the wrong lowest active async ID when there was both running and pending async work. This caused things like no being able to read the root filesystem, resulting in missing console devices and inability to run 'init', causing a boot-time panic. This fixes it by properly returning the lowest pending async ID: if there is any running async work, that will have a lower ID than any pending work, and we should _not_ look at the pending work list. There were alternative patches from Jaswinder and James, but this one also cleans up the code by removing the pointless 'ret' variable and the unnecesary testing for an empty list around 'for_each_entry()' (if the list is empty, the for_each_entry() thing just won't execute). Fixes-bug: http://bugzilla.kernel.org/show_bug.cgi?id=13474 Reported-and-tested-by: Chris Clayton Cc: Jaswinder Singh Rajput Cc: James Bottomley Cc: Arjan van de Ven Signed-off-by: Linus Torvalds --- kernel/async.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 50540301ed0..27235f5de19 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -92,23 +92,18 @@ extern int initcall_debug; static async_cookie_t __lowest_in_progress(struct list_head *running) { struct async_entry *entry; - async_cookie_t ret = next_cookie; /* begin with "infinity" value */ if (!list_empty(running)) { entry = list_first_entry(running, struct async_entry, list); - ret = entry->cookie; + return entry->cookie; } - if (!list_empty(&async_pending)) { - list_for_each_entry(entry, &async_pending, list) - if (entry->running == running) { - ret = entry->cookie; - break; - } - } + list_for_each_entry(entry, &async_pending, list) + if (entry->running == running) + return entry->cookie; - return ret; + return next_cookie; /* "infinity" value */ } static async_cookie_t lowest_in_progress(struct list_head *running) -- cgit v1.2.3 From 1f8a6a10fb9437eac3f516ea4324a19087872f30 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 8 Jun 2009 18:18:39 +0200 Subject: ring-buffer: pass in lockdep class key for reader_lock On Sun, 7 Jun 2009, Ingo Molnar wrote: > Testing tracer sched_switch: <6>Starting ring buffer hammer > PASSED > Testing tracer sysprof: PASSED > Testing tracer function: PASSED > Testing tracer irqsoff: > ============================================= > PASSED > Testing tracer preemptoff: PASSED > Testing tracer preemptirqsoff: [ INFO: possible recursive locking detected ] > PASSED > Testing tracer branch: 2.6.30-rc8-tip-01972-ge5b9078-dirty #5760 > --------------------------------------------- > rb_consumer/431 is trying to acquire lock: > (&cpu_buffer->reader_lock){......}, at: [] ring_buffer_reset_cpu+0x37/0x70 > > but task is already holding lock: > (&cpu_buffer->reader_lock){......}, at: [] ring_buffer_consume+0x7e/0xc0 > > other info that might help us debug this: > 1 lock held by rb_consumer/431: > #0: (&cpu_buffer->reader_lock){......}, at: [] ring_buffer_consume+0x7e/0xc0 The ring buffer is a generic structure, and can be used outside of ftrace. If ftrace traces within the use of the ring buffer, it can produce false positives with lockdep. This patch passes in a static lock key into the allocation of the ring buffer, so that different ring buffers will have their own lock class. Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra LKML-Reference: <1244477919.13761.9042.camel@twins> [ store key in ring buffer descriptor ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7102d7a2fad..22878b0d370 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -426,6 +426,8 @@ struct ring_buffer { atomic_t record_disabled; cpumask_var_t cpumask; + struct lock_class_key *reader_lock_key; + struct mutex mutex; struct ring_buffer_per_cpu **buffers; @@ -565,6 +567,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; spin_lock_init(&cpu_buffer->reader_lock); + lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; INIT_LIST_HEAD(&cpu_buffer->pages); @@ -635,7 +638,8 @@ static int rb_cpu_notify(struct notifier_block *self, * when the buffer wraps. If this flag is not set, the buffer will * drop data when the tail hits the head. */ -struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) +struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, + struct lock_class_key *key) { struct ring_buffer *buffer; int bsize; @@ -658,6 +662,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); buffer->flags = flags; buffer->clock = trace_clock_local; + buffer->reader_lock_key = key; /* need at least two pages */ if (buffer->pages == 1) @@ -715,7 +720,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) kfree(buffer); return NULL; } -EXPORT_SYMBOL_GPL(ring_buffer_alloc); +EXPORT_SYMBOL_GPL(__ring_buffer_alloc); /** * ring_buffer_free - free a ring buffer. -- cgit v1.2.3 From eaa958402ea40851097d051f52ba1bb7a885efe9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 6 Jun 2009 14:51:36 -0700 Subject: cpumask: alloc zeroed cpumask for static cpumask_var_ts These are defined as static cpumask_var_t so if MAXSMP is not used, they are cleared already. Avoid surprises when MAXSMP is enabled. Signed-off-by: Yinghai Lu Signed-off-by: Rusty Russell --- kernel/sched_cpupri.c | 2 +- kernel/sched_rt.c | 2 +- kernel/smp.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index cdd3c89574c..344712a5e3e 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -165,7 +165,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) vec->count = 0; if (bootmem) alloc_bootmem_cpumask_var(&vec->mask); - else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL)) + else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) goto cleanup; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f2c66f8f971..9bf0d2a7304 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1591,7 +1591,7 @@ static inline void init_sched_rt_class(void) unsigned int i; for_each_possible_cpu(i) - alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), + zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), GFP_KERNEL, cpu_to_node(i)); } #endif /* CONFIG_SMP */ diff --git a/kernel/smp.c b/kernel/smp.c index 858baac568e..ad63d850120 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -52,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, + if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return NOTIFY_BAD; break; -- cgit v1.2.3 From f57a8a1911342265e7acdc190333c4e9235a6632 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 5 Jun 2009 14:11:30 -0400 Subject: ring-buffer: fix ret in rb_add_time_stamp The update of ret got mistakenly added to the if statement of rb_try_to_discard. The variable ret should be 1 on commit and zero otherwise. [ Impact: fix compiler warning and real bug ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 22878b0d370..2e642b2b725 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1433,8 +1433,8 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, /* Darn, this is just wasted space */ event->time_delta = 0; event->array[0] = 0; - ret = 0; } + ret = 0; } *delta = 0; -- cgit v1.2.3 From 55782138e47d9baf2f7d3a7af9e7cf42adf72c56 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 9 Jun 2009 13:43:05 +0800 Subject: tracing/events: convert block trace points to TRACE_EVENT() TRACE_EVENT is a more generic way to define tracepoints. Doing so adds these new capabilities to this tracepoint: - zero-copy and per-cpu splice() tracing - binary tracing without printf overhead - structured logging records exposed under /debug/tracing/events - trace events embedded in function tracer output and other plugins - user-defined, per tracepoint filter expressions ... Cons: - no dev_t info for the output of plug, unplug_timer and unplug_io events. no dev_t info for getrq and sleeprq events if bio == NULL. no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL. This is mainly because we can't get the deivce from a request queue. But this may change in the future. - A packet command is converted to a string in TP_assign, not TP_print. While blktrace do the convertion just before output. Since pc requests should be rather rare, this is not a big issue. - In blktrace, an event can have 2 different print formats, but a TRACE_EVENT has a unique format, which means we have some unused data in a trace entry. The overhead is minimized by using __dynamic_array() instead of __array(). I've benchmarked the ioctl blktrace vs the splice based TRACE_EVENT tracing: dd dd + ioctl blktrace dd + TRACE_EVENT (splice) 1 7.36s, 42.7 MB/s 7.50s, 42.0 MB/s 7.41s, 42.5 MB/s 2 7.43s, 42.3 MB/s 7.48s, 42.1 MB/s 7.43s, 42.4 MB/s 3 7.38s, 42.6 MB/s 7.45s, 42.2 MB/s 7.41s, 42.5 MB/s So the overhead of tracing is very small, and no regression when using those trace events vs blktrace. And the binary output of TRACE_EVENT is much smaller than blktrace: # ls -l -h -rw-r--r-- 1 root root 8.8M 06-09 13:24 sda.blktrace.0 -rw-r--r-- 1 root root 195K 06-09 13:24 sda.blktrace.1 -rw-r--r-- 1 root root 2.7M 06-09 13:25 trace_splice.out Following are some comparisons between TRACE_EVENT and blktrace: plug: kjournald-480 [000] 303.084981: block_plug: [kjournald] kjournald-480 [000] 303.084981: 8,0 P N [kjournald] unplug_io: kblockd/0-118 [000] 300.052973: block_unplug_io: [kblockd/0] 1 kblockd/0-118 [000] 300.052974: 8,0 U N [kblockd/0] 1 remap: kjournald-480 [000] 303.085042: block_remap: 8,0 W 102736992 + 8 <- (8,8) 33384 kjournald-480 [000] 303.085043: 8,0 A W 102736992 + 8 <- (8,8) 33384 bio_backmerge: kjournald-480 [000] 303.085086: block_bio_backmerge: 8,0 W 102737032 + 8 [kjournald] kjournald-480 [000] 303.085086: 8,0 M W 102737032 + 8 [kjournald] getrq: kjournald-480 [000] 303.084974: block_getrq: 8,0 W 102736984 + 8 [kjournald] kjournald-480 [000] 303.084975: 8,0 G W 102736984 + 8 [kjournald] bash-2066 [001] 1072.953770: 8,0 G N [bash] bash-2066 [001] 1072.953773: block_getrq: 0,0 N 0 + 0 [bash] rq_complete: konsole-2065 [001] 300.053184: block_rq_complete: 8,0 W () 103669040 + 16 [0] konsole-2065 [001] 300.053191: 8,0 C W 103669040 + 16 [0] ksoftirqd/1-7 [001] 1072.953811: 8,0 C N (5a 00 08 00 00 00 00 00 24 00) [0] ksoftirqd/1-7 [001] 1072.953813: block_rq_complete: 0,0 N (5a 00 08 00 00 00 00 00 24 00) 0 + 0 [0] rq_insert: kjournald-480 [000] 303.084985: block_rq_insert: 8,0 W 0 () 102736984 + 8 [kjournald] kjournald-480 [000] 303.084986: 8,0 I W 102736984 + 8 [kjournald] Changelog from v2 -> v3: - use the newly introduced __dynamic_array(). Changelog from v1 -> v2: - use __string() instead of __array() to minimize the memory required to store hex dump of rq->cmd(). - support large pc requests. - add missing blk_fill_rwbs_rq() in block_rq_requeue TRACE_EVENT. - some cleanups. Signed-off-by: Li Zefan LKML-Reference: <4A2DF669.5070905@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/Makefile | 5 +++- kernel/trace/blktrace.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 81 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 06b85850fab..844164dca90 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -45,7 +45,10 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o -obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o +obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o +ifeq ($(CONFIG_BLOCK),y) +obj-$(CONFIG_EVENT_TRACING) += blktrace.o +endif obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e3abf55bc8e..7bd6a9893c2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -23,10 +23,14 @@ #include #include #include -#include #include + +#include + #include "trace_output.h" +#ifdef CONFIG_BLK_DEV_IO_TRACE + static unsigned int blktrace_seq __read_mostly = 1; static struct trace_array *blk_tr; @@ -1658,3 +1662,75 @@ int blk_trace_init_sysfs(struct device *dev) return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); } +#endif /* CONFIG_BLK_DEV_IO_TRACE */ + +#ifdef CONFIG_EVENT_TRACING + +void blk_dump_cmd(char *buf, struct request *rq) +{ + int i, end; + int len = rq->cmd_len; + unsigned char *cmd = rq->cmd; + + if (!blk_pc_request(rq)) { + buf[0] = '\0'; + return; + } + + for (end = len - 1; end >= 0; end--) + if (cmd[end]) + break; + end++; + + for (i = 0; i < len; i++) { + buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]); + if (i == end && end != len - 1) { + sprintf(buf, " .."); + break; + } + } +} + +void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) +{ + int i = 0; + + if (rw & WRITE) + rwbs[i++] = 'W'; + else if (rw & 1 << BIO_RW_DISCARD) + rwbs[i++] = 'D'; + else if (bytes) + rwbs[i++] = 'R'; + else + rwbs[i++] = 'N'; + + if (rw & 1 << BIO_RW_AHEAD) + rwbs[i++] = 'A'; + if (rw & 1 << BIO_RW_BARRIER) + rwbs[i++] = 'B'; + if (rw & 1 << BIO_RW_SYNCIO) + rwbs[i++] = 'S'; + if (rw & 1 << BIO_RW_META) + rwbs[i++] = 'M'; + + rwbs[i] = '\0'; +} + +void blk_fill_rwbs_rq(char *rwbs, struct request *rq) +{ + int rw = rq->cmd_flags & 0x03; + int bytes; + + if (blk_discard_rq(rq)) + rw |= (1 << BIO_RW_DISCARD); + + if (blk_pc_request(rq)) + bytes = rq->data_len; + else + bytes = rq->hard_nr_sectors << 9; + + blk_fill_rwbs(rwbs, rw, bytes); +} + +#endif /* CONFIG_EVENT_TRACING */ + -- cgit v1.2.3 From 725c624a58a10ef90a2ff889e122158fabf36147 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 8 Jun 2009 19:09:45 -0400 Subject: tracing: add trace_seq_vprint interface The code to update the print formats for events requires a vprintf format in the trace_seq. This patch adds that interface. Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 425725c1622..c05aff465dc 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -100,6 +100,38 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) } EXPORT_SYMBOL_GPL(trace_seq_printf); +/** + * trace_seq_vprintf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + */ +int +trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) +{ + int len = (PAGE_SIZE - 1) - s->len; + int ret; + + if (!len) + return 0; + + ret = vsnprintf(s->buffer + s->len, len, fmt, args); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) + return 0; + + s->len += ret; + + return len; +} +EXPORT_SYMBOL_GPL(trace_seq_vprintf); + int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { int len = (PAGE_SIZE - 1) - s->len; -- cgit v1.2.3 From ad6ccfad6f759a5d657dabe2071a8f2a503fcc84 Mon Sep 17 00:00:00 2001 From: Manish Katiyar Date: Tue, 12 May 2009 13:43:35 -0700 Subject: kernel/kallsyms.c: replace deprecated __initcall with device_initcall and fix whitespace Fix coding style whitespace issues and replace __initcall with device_initcall. Fixed multi-line comments as per coding style. Errors as reported by checkpatch.pl :- Before: total: 14 errors, 14 warnings, 487 lines checked After : total: 0 errors, 8 warnings, 507 lines checked Compile tested binary verified as :- Before: text data bss dec hex filename 2405 4 0 2409 969 kernel/kallsyms.o After : text data bss dec hex filename 2405 4 0 2409 969 kernel/kallsyms.o Signed-off-by: Manish Katiyar Signed-off-by: Andrew Morton Signed-off-by: Sam Ravnborg --- kernel/kallsyms.c | 134 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 78 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 374faf9bfdc..3a29dbe7898 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -30,12 +30,16 @@ #define all_var 0 #endif -/* These will be re-linked against their real values during the second link stage */ +/* + * These will be re-linked against their real values + * during the second link stage. + */ extern const unsigned long kallsyms_addresses[] __attribute__((weak)); extern const u8 kallsyms_names[] __attribute__((weak)); -/* tell the compiler that the count isn't in the small data section if the arch - * has one (eg: FRV) +/* + * Tell the compiler that the count isn't in the small data section if the arch + * has one (eg: FRV). */ extern const unsigned long kallsyms_num_syms __attribute__((weak, section(".rodata"))); @@ -75,31 +79,37 @@ static int is_ksym_addr(unsigned long addr) return is_kernel_text(addr) || is_kernel_inittext(addr); } -/* expand a compressed symbol data into the resulting uncompressed string, - given the offset to where the symbol is in the compressed stream */ +/* + * Expand a compressed symbol data into the resulting uncompressed string, + * given the offset to where the symbol is in the compressed stream. + */ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) { int len, skipped_first = 0; const u8 *tptr, *data; - /* get the compressed symbol length from the first symbol byte */ + /* Get the compressed symbol length from the first symbol byte. */ data = &kallsyms_names[off]; len = *data; data++; - /* update the offset to return the offset for the next symbol on - * the compressed stream */ + /* + * Update the offset to return the offset for the next symbol on + * the compressed stream. + */ off += len + 1; - /* for every byte on the compressed symbol data, copy the table - entry for that byte */ - while(len) { - tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ]; + /* + * For every byte on the compressed symbol data, copy the table + * entry for that byte. + */ + while (len) { + tptr = &kallsyms_token_table[kallsyms_token_index[*data]]; data++; len--; while (*tptr) { - if(skipped_first) { + if (skipped_first) { *result = *tptr; result++; } else @@ -110,36 +120,46 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) *result = '\0'; - /* return to offset to the next symbol */ + /* Return to offset to the next symbol. */ return off; } -/* get symbol type information. This is encoded as a single char at the - * begining of the symbol name */ +/* + * Get symbol type information. This is encoded as a single char at the + * beginning of the symbol name. + */ static char kallsyms_get_symbol_type(unsigned int off) { - /* get just the first code, look it up in the token table, and return the - * first char from this token */ - return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ]; + /* + * Get just the first code, look it up in the token table, + * and return the first char from this token. + */ + return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]]; } -/* find the offset on the compressed stream given and index in the - * kallsyms array */ +/* + * Find the offset on the compressed stream given and index in the + * kallsyms array. + */ static unsigned int get_symbol_offset(unsigned long pos) { const u8 *name; int i; - /* use the closest marker we have. We have markers every 256 positions, - * so that should be close enough */ - name = &kallsyms_names[ kallsyms_markers[pos>>8] ]; + /* + * Use the closest marker we have. We have markers every 256 positions, + * so that should be close enough. + */ + name = &kallsyms_names[kallsyms_markers[pos >> 8]]; - /* sequentially scan all the symbols up to the point we're searching for. - * Every symbol is stored in a [][ bytes of data] format, so we - * just need to add the len to the current pointer for every symbol we - * wish to skip */ - for(i = 0; i < (pos&0xFF); i++) + /* + * Sequentially scan all the symbols up to the point we're searching + * for. Every symbol is stored in a [][ bytes of data] format, + * so we just need to add the len to the current pointer for every + * symbol we wish to skip. + */ + for (i = 0; i < (pos & 0xFF); i++) name = name + (*name) + 1; return name - kallsyms_names; @@ -190,7 +210,7 @@ static unsigned long get_symbol_pos(unsigned long addr, /* This kernel should never had been booted. */ BUG_ON(!kallsyms_addresses); - /* do a binary search on the sorted kallsyms_addresses array */ + /* Do a binary search on the sorted kallsyms_addresses array. */ low = 0; high = kallsyms_num_syms; @@ -203,15 +223,15 @@ static unsigned long get_symbol_pos(unsigned long addr, } /* - * search for the first aliased symbol. Aliased - * symbols are symbols with the same address + * Search for the first aliased symbol. Aliased + * symbols are symbols with the same address. */ while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low]) --low; symbol_start = kallsyms_addresses[low]; - /* Search for next non-aliased symbol */ + /* Search for next non-aliased symbol. */ for (i = low + 1; i < kallsyms_num_syms; i++) { if (kallsyms_addresses[i] > symbol_start) { symbol_end = kallsyms_addresses[i]; @@ -219,7 +239,7 @@ static unsigned long get_symbol_pos(unsigned long addr, } } - /* if we found no next symbol, we use the end of the section */ + /* If we found no next symbol, we use the end of the section. */ if (!symbol_end) { if (is_kernel_inittext(addr)) symbol_end = (unsigned long)_einittext; @@ -252,10 +272,10 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, /* * Lookup an address - * - modname is set to NULL if it's in the kernel - * - we guarantee that the returned name is valid until we reschedule even if - * it resides in a module - * - we also guarantee that modname will be valid until rescheduled + * - modname is set to NULL if it's in the kernel. + * - We guarantee that the returned name is valid until we reschedule even if. + * It resides in a module. + * - We also guarantee that modname will be valid until rescheduled. */ const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, @@ -276,7 +296,7 @@ const char *kallsyms_lookup(unsigned long addr, return namebuf; } - /* see if it's in a module */ + /* See if it's in a module. */ return module_address_lookup(addr, symbolsize, offset, modname, namebuf); } @@ -294,7 +314,7 @@ int lookup_symbol_name(unsigned long addr, char *symname) kallsyms_expand_symbol(get_symbol_offset(pos), symname); return 0; } - /* see if it's in a module */ + /* See if it's in a module. */ return lookup_module_symbol_name(addr, symname); } @@ -313,7 +333,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, modname[0] = '\0'; return 0; } - /* see if it's in a module */ + /* See if it's in a module. */ return lookup_module_symbol_attrs(addr, size, offset, modname, name); } @@ -342,6 +362,7 @@ int sprint_symbol(char *buffer, unsigned long address) return len; } +EXPORT_SYMBOL_GPL(sprint_symbol); /* Look up a kernel symbol and print it to the kernel messages. */ void __print_symbol(const char *fmt, unsigned long address) @@ -352,13 +373,13 @@ void __print_symbol(const char *fmt, unsigned long address) printk(fmt, buffer); } +EXPORT_SYMBOL(__print_symbol); /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ -struct kallsym_iter -{ +struct kallsym_iter { loff_t pos; unsigned long value; - unsigned int nameoff; /* If iterating in core kernel symbols */ + unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; char name[KSYM_NAME_LEN]; char module_name[MODULE_NAME_LEN]; @@ -404,7 +425,7 @@ static int update_iter(struct kallsym_iter *iter, loff_t pos) iter->pos = pos; return get_ksymbol_mod(iter); } - + /* If we're not on the desired position, reset to new position. */ if (pos != iter->pos) reset_iter(iter, pos); @@ -439,23 +460,25 @@ static int s_show(struct seq_file *m, void *p) { struct kallsym_iter *iter = m->private; - /* Some debugging symbols have no name. Ignore them. */ + /* Some debugging symbols have no name. Ignore them. */ if (!iter->name[0]) return 0; if (iter->module_name[0]) { char type; - /* Label it "global" if it is exported, - * "local" if not exported. */ + /* + * Label it "global" if it is exported, + * "local" if not exported. + */ type = iter->exported ? toupper(iter->type) : tolower(iter->type); seq_printf(m, "%0*lx %c %s\t[%s]\n", - (int)(2*sizeof(void*)), + (int)(2 * sizeof(void *)), iter->value, type, iter->name, iter->module_name); } else seq_printf(m, "%0*lx %c %s\n", - (int)(2*sizeof(void*)), + (int)(2 * sizeof(void *)), iter->value, iter->type, iter->name); return 0; } @@ -469,9 +492,11 @@ static const struct seq_operations kallsyms_op = { static int kallsyms_open(struct inode *inode, struct file *file) { - /* We keep iterator in m->private, since normal case is to + /* + * We keep iterator in m->private, since normal case is to * s_start from where we left off, so we avoid doing - * using get_symbol_offset for every symbol */ + * using get_symbol_offset for every symbol. + */ struct kallsym_iter *iter; int ret; @@ -500,7 +525,4 @@ static int __init kallsyms_init(void) proc_create("kallsyms", 0444, NULL, &kallsyms_operations); return 0; } -__initcall(kallsyms_init); - -EXPORT_SYMBOL(__print_symbol); -EXPORT_SYMBOL_GPL(sprint_symbol); +device_initcall(kallsyms_init); -- cgit v1.2.3 From 110bf2b764eb6026b868d84499263cb24b1bcc8d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 9 Jun 2009 17:29:07 -0400 Subject: tracing: add protection around module events unload When reading the trace buffer, there is a race that when a module is unloaded it removes events that is stilled referenced in the buffers. This patch adds the protection around the unloading of the events from modules and the reading of the trace buffers. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 4 +++- kernel/trace/trace_output.c | 15 ++++++++++++--- kernel/trace/trace_output.h | 4 ++++ 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6c81f9c2142..aa08be69a1b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1050,12 +1050,13 @@ static void trace_module_remove_events(struct module *mod) struct ftrace_event_call *call, *p; bool found = false; + down_write(&trace_event_mutex); list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { found = true; ftrace_event_enable_disable(call, 0); if (call->event) - unregister_ftrace_event(call->event); + __unregister_ftrace_event(call->event); debugfs_remove_recursive(call->dir); list_del(&call->list); trace_destroy_fields(call); @@ -1079,6 +1080,7 @@ static void trace_module_remove_events(struct module *mod) */ if (found) tracing_reset_current_online_cpus(); + up_write(&trace_event_mutex); } static int trace_module_notify(struct notifier_block *self, diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c05aff465dc..7938f3ae93e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -14,7 +14,7 @@ /* must be a power of 2 */ #define EVENT_HASHSIZE 128 -static DECLARE_RWSEM(trace_event_mutex); +DECLARE_RWSEM(trace_event_mutex); DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); EXPORT_PER_CPU_SYMBOL(ftrace_event_seq); @@ -702,6 +702,16 @@ int register_ftrace_event(struct trace_event *event) } EXPORT_SYMBOL_GPL(register_ftrace_event); +/* + * Used by module code with the trace_event_mutex held for write. + */ +int __unregister_ftrace_event(struct trace_event *event) +{ + hlist_del(&event->node); + list_del(&event->list); + return 0; +} + /** * unregister_ftrace_event - remove a no longer used event * @event: the event to remove @@ -709,8 +719,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_event); int unregister_ftrace_event(struct trace_event *event) { down_write(&trace_event_mutex); - hlist_del(&event->node); - list_del(&event->list); + __unregister_ftrace_event(event); up_write(&trace_event_mutex); return 0; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index ac240e76eb0..d38bec4a9c3 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -27,6 +27,10 @@ extern struct trace_event *ftrace_find_event(int type); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); +/* used by module unregistering */ +extern int __unregister_ftrace_event(struct trace_event *event); +extern struct rw_semaphore trace_event_mutex; + #define MAX_MEMHEX_BYTES 8 #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) -- cgit v1.2.3 From bd2b5b12849a3446abad0b25e920f86f5480b309 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 13:40:57 +0200 Subject: perf_counter: More aggressive frequency adjustment Also employ the overflow handler to adjust the frequency, this results in a stable frequency in about 40~50 samples, instead of that many ticks. This also means we can start sampling at a sample period of 1 without running head-first into the throttle. It relies on sched_clock() to accurately measure the time difference between the overflow NMIs. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 130 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 88 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5eacaaf3f9c..51c571ee4d0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1184,13 +1184,33 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) static void perf_log_throttle(struct perf_counter *counter, int enable); static void perf_log_period(struct perf_counter *counter, u64 period); -static void perf_adjust_freq(struct perf_counter_context *ctx) +static void perf_adjust_period(struct perf_counter *counter, u64 events) +{ + struct hw_perf_counter *hwc = &counter->hw; + u64 period, sample_period; + s64 delta; + + events *= hwc->sample_period; + period = div64_u64(events, counter->attr.sample_freq); + + delta = (s64)(period - hwc->sample_period); + delta = (delta + 7) / 8; /* low pass filter */ + + sample_period = hwc->sample_period + delta; + + if (!sample_period) + sample_period = 1; + + perf_log_period(counter, sample_period); + + hwc->sample_period = sample_period; +} + +static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; struct hw_perf_counter *hwc; - u64 interrupts, sample_period; - u64 events, period, freq; - s64 delta; + u64 interrupts, freq; spin_lock(&ctx->lock); list_for_each_entry(counter, &ctx->counter_list, list_entry) { @@ -1202,6 +1222,9 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) interrupts = hwc->interrupts; hwc->interrupts = 0; + /* + * unthrottle counters on the tick + */ if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(counter, 1); counter->pmu->unthrottle(counter); @@ -1211,6 +1234,9 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) if (!counter->attr.freq || !counter->attr.sample_freq) continue; + /* + * if the specified freq < HZ then we need to skip ticks + */ if (counter->attr.sample_freq < HZ) { freq = counter->attr.sample_freq; @@ -1226,20 +1252,20 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) } else freq = HZ; - events = freq * interrupts * hwc->sample_period; - period = div64_u64(events, counter->attr.sample_freq); - - delta = (s64)(1 + period - hwc->sample_period); - delta >>= 1; - - sample_period = hwc->sample_period + delta; - - if (!sample_period) - sample_period = 1; + perf_adjust_period(counter, freq * interrupts); - perf_log_period(counter, sample_period); - - hwc->sample_period = sample_period; + /* + * In order to avoid being stalled by an (accidental) huge + * sample period, force reset the sample period if we didn't + * get any events in this freq period. + */ + if (!interrupts) { + perf_disable(); + counter->pmu->disable(counter); + atomic_set(&hwc->period_left, 0); + counter->pmu->enable(counter); + perf_enable(); + } } spin_unlock(&ctx->lock); } @@ -1279,9 +1305,9 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = curr->perf_counter_ctxp; - perf_adjust_freq(&cpuctx->ctx); + perf_ctx_adjust_freq(&cpuctx->ctx); if (ctx) - perf_adjust_freq(ctx); + perf_ctx_adjust_freq(ctx); perf_counter_cpu_sched_out(cpuctx); if (ctx) @@ -1647,10 +1673,10 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) counter->attr.sample_freq = value; } else { + perf_log_period(counter, value); + counter->attr.sample_period = value; counter->hw.sample_period = value; - - perf_log_period(counter, value); } unlock: spin_unlock_irq(&ctx->lock); @@ -2853,35 +2879,41 @@ void __perf_counter_mmap(struct vm_area_struct *vma) * event flow. */ +struct freq_event { + struct perf_event_header header; + u64 time; + u64 id; + u64 period; +}; + static void perf_log_period(struct perf_counter *counter, u64 period) { struct perf_output_handle handle; + struct freq_event event; int ret; - struct { - struct perf_event_header header; - u64 time; - u64 id; - u64 period; - } freq_event = { + if (counter->hw.sample_period == period) + return; + + if (counter->attr.sample_type & PERF_SAMPLE_PERIOD) + return; + + event = (struct freq_event) { .header = { .type = PERF_EVENT_PERIOD, .misc = 0, - .size = sizeof(freq_event), + .size = sizeof(event), }, .time = sched_clock(), .id = counter->id, .period = period, }; - if (counter->hw.sample_period == period) - return; - - ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0); + ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0); if (ret) return; - perf_output_put(&handle, freq_event); + perf_output_put(&handle, event); perf_output_end(&handle); } @@ -2923,15 +2955,16 @@ int perf_counter_overflow(struct perf_counter *counter, { int events = atomic_read(&counter->event_limit); int throttle = counter->pmu->unthrottle != NULL; + struct hw_perf_counter *hwc = &counter->hw; int ret = 0; if (!throttle) { - counter->hw.interrupts++; + hwc->interrupts++; } else { - if (counter->hw.interrupts != MAX_INTERRUPTS) { - counter->hw.interrupts++; - if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) { - counter->hw.interrupts = MAX_INTERRUPTS; + if (hwc->interrupts != MAX_INTERRUPTS) { + hwc->interrupts++; + if (HZ * hwc->interrupts > (u64)sysctl_perf_counter_limit) { + hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(counter, 0); ret = 1; } @@ -2945,6 +2978,16 @@ int perf_counter_overflow(struct perf_counter *counter, } } + if (counter->attr.freq) { + u64 now = sched_clock(); + s64 delta = now - hwc->freq_stamp; + + hwc->freq_stamp = now; + + if (delta > 0 && delta < TICK_NSEC) + perf_adjust_period(counter, NSEC_PER_SEC / (int)delta); + } + /* * XXX event_limit might not quite work as expected on inherited * counters @@ -3379,7 +3422,6 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) return NULL; counter->destroy = tp_perf_counter_destroy; - counter->hw.sample_period = counter->attr.sample_period; return &perf_ops_generic; } @@ -3483,10 +3525,11 @@ perf_counter_alloc(struct perf_counter_attr *attr, pmu = NULL; hwc = &counter->hw; + hwc->sample_period = attr->sample_period; if (attr->freq && attr->sample_freq) - hwc->sample_period = div64_u64(TICK_NSEC, attr->sample_freq); - else - hwc->sample_period = attr->sample_period; + hwc->sample_period = 1; + + atomic64_set(&hwc->period_left, hwc->sample_period); /* * we currently do not support PERF_SAMPLE_GROUP on inherited counters @@ -3687,6 +3730,9 @@ inherit_counter(struct perf_counter *parent_counter, else child_counter->state = PERF_COUNTER_STATE_OFF; + if (parent_counter->attr.freq) + child_counter->hw.sample_period = parent_counter->hw.sample_period; + /* * Link it up in the child's context: */ -- cgit v1.2.3 From 66fff22483d8542dfb4d61a28d21277bbde321e8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 22:53:37 +0200 Subject: perf_counter: Annotate exit ctx recursion Ever since Paul fixed it to unclone the context before taking the ctx->lock this became a false positive, annotate it away. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 51c571ee4d0..ae591a1275a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3879,7 +3879,18 @@ void perf_counter_exit_task(struct task_struct *child) spin_unlock(&child_ctx->lock); local_irq_restore(flags); - mutex_lock(&child_ctx->mutex); + /* + * We can recurse on the same lock type through: + * + * __perf_counter_exit_task() + * sync_child_counter() + * fput(parent_counter->filp) + * perf_release() + * mutex_lock(&ctx->mutex) + * + * But since its the parent context it won't be the same instance. + */ + mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); again: list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, -- cgit v1.2.3 From df1a132bf3d3508f863336c80a27806a2ac947e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 21:02:22 +0200 Subject: perf_counter: Introduce struct for sample data For easy extension of the sample data, put it in a structure. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ae591a1275a..4fe85e804f4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2378,8 +2378,8 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } -static void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs, u64 addr) +static void perf_counter_output(struct perf_counter *counter, int nmi, + struct perf_sample_data *data) { int ret; u64 sample_type = counter->attr.sample_type; @@ -2404,10 +2404,10 @@ static void perf_counter_output(struct perf_counter *counter, header.size = sizeof(header); header.misc = PERF_EVENT_MISC_OVERFLOW; - header.misc |= perf_misc_flags(regs); + header.misc |= perf_misc_flags(data->regs); if (sample_type & PERF_SAMPLE_IP) { - ip = perf_instruction_pointer(regs); + ip = perf_instruction_pointer(data->regs); header.type |= PERF_SAMPLE_IP; header.size += sizeof(ip); } @@ -2460,7 +2460,7 @@ static void perf_counter_output(struct perf_counter *counter, } if (sample_type & PERF_SAMPLE_CALLCHAIN) { - callchain = perf_callchain(regs); + callchain = perf_callchain(data->regs); if (callchain) { callchain_size = (1 + callchain->nr) * sizeof(u64); @@ -2486,7 +2486,7 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_put(&handle, time); if (sample_type & PERF_SAMPLE_ADDR) - perf_output_put(&handle, addr); + perf_output_put(&handle, data->addr); if (sample_type & PERF_SAMPLE_ID) perf_output_put(&handle, counter->id); @@ -2950,8 +2950,8 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) * Generic counter overflow handling. */ -int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs, u64 addr) +int perf_counter_overflow(struct perf_counter *counter, int nmi, + struct perf_sample_data *data) { int events = atomic_read(&counter->event_limit); int throttle = counter->pmu->unthrottle != NULL; @@ -3005,7 +3005,7 @@ int perf_counter_overflow(struct perf_counter *counter, perf_counter_disable(counter); } - perf_counter_output(counter, nmi, regs, addr); + perf_counter_output(counter, nmi, data); return ret; } @@ -3054,24 +3054,25 @@ static void perf_swcounter_set_period(struct perf_counter *counter) static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) { enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; struct perf_counter *counter; - struct pt_regs *regs; u64 period; counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); counter->pmu->read(counter); - regs = get_irq_regs(); + data.addr = 0; + data.regs = get_irq_regs(); /* * In case we exclude kernel IPs or are somehow not in interrupt * context, provide the next best thing, the user IP. */ - if ((counter->attr.exclude_kernel || !regs) && + if ((counter->attr.exclude_kernel || !data.regs) && !counter->attr.exclude_user) - regs = task_pt_regs(current); + data.regs = task_pt_regs(current); - if (regs) { - if (perf_counter_overflow(counter, 0, regs, 0)) + if (data.regs) { + if (perf_counter_overflow(counter, 0, &data)) ret = HRTIMER_NORESTART; } @@ -3084,9 +3085,14 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) static void perf_swcounter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { + struct perf_sample_data data = { + .regs = regs, + .addr = addr, + }; + perf_swcounter_update(counter); perf_swcounter_set_period(counter); - if (perf_counter_overflow(counter, nmi, regs, addr)) + if (perf_counter_overflow(counter, nmi, &data)) /* soft-disable the counter */ ; -- cgit v1.2.3 From 9e350de37ac9607012fcf9c5314a28fbddf8f43c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 21:34:59 +0200 Subject: perf_counter: Accurate period data We currently log hw.sample_period for PERF_SAMPLE_PERIOD, however this is incorrect. When we adjust the period, it will only take effect the next cycle but report it for the current cycle. So when we adjust the period for every cycle, we're always wrong. Solve this by keeping track of the last_period. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4fe85e804f4..8b89b40bd0f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2495,7 +2495,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, perf_output_put(&handle, cpu_entry); if (sample_type & PERF_SAMPLE_PERIOD) - perf_output_put(&handle, counter->hw.sample_period); + perf_output_put(&handle, data->period); /* * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. @@ -3040,11 +3040,13 @@ static void perf_swcounter_set_period(struct perf_counter *counter) if (unlikely(left <= -period)) { left = period; atomic64_set(&hwc->period_left, left); + hwc->last_period = period; } if (unlikely(left <= 0)) { left += period; atomic64_add(period, &hwc->period_left); + hwc->last_period = period; } atomic64_set(&hwc->prev_count, -left); @@ -3086,8 +3088,9 @@ static void perf_swcounter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { struct perf_sample_data data = { - .regs = regs, - .addr = addr, + .regs = regs, + .addr = addr, + .period = counter->hw.last_period, }; perf_swcounter_update(counter); -- cgit v1.2.3 From 0764771dab80d7b84b9a271bee7f1b21a04a3f0c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 11:18:36 +0200 Subject: perf_counter: More paranoia settings Rename the perf_counter_priv knob to perf_counter_paranoia (because priv can be read as private, as opposed to privileged) and provide one more level: 0 - permissive 1 - restrict cpu counters to privilidged contexts 2 - restrict kernel-mode code counting and profiling Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 25 +++++++++++++++++++++++-- kernel/sysctl.c | 6 +++--- 2 files changed, 26 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8b89b40bd0f..63f1987c1c1 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -43,7 +43,23 @@ static atomic_t nr_counters __read_mostly; static atomic_t nr_mmap_counters __read_mostly; static atomic_t nr_comm_counters __read_mostly; -int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ +/* + * 0 - not paranoid + * 1 - disallow cpu counters to unpriv + * 2 - disallow kernel profiling to unpriv + */ +int sysctl_perf_counter_paranoid __read_mostly; /* do we need to be privileged */ + +static inline bool perf_paranoid_cpu(void) +{ + return sysctl_perf_counter_paranoid > 0; +} + +static inline bool perf_paranoid_kernel(void) +{ + return sysctl_perf_counter_paranoid > 1; +} + int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */ @@ -1385,7 +1401,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) */ if (cpu != -1) { /* Must be root to operate on a CPU counter: */ - if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); if (cpu < 0 || cpu > num_possible_cpus()) @@ -3618,6 +3634,11 @@ SYSCALL_DEFINE5(perf_counter_open, if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0) return -EFAULT; + if (!attr.exclude_kernel) { + if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + } + /* * Get the target context (task or percpu): */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0c4bf863afa..344a65981de 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -916,9 +916,9 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_PERF_COUNTERS { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_privileged", - .data = &sysctl_perf_counter_priv, - .maxlen = sizeof(sysctl_perf_counter_priv), + .procname = "perf_counter_paranoid", + .data = &sysctl_perf_counter_paranoid, + .maxlen = sizeof(sysctl_perf_counter_paranoid), .mode = 0644, .proc_handler = &proc_dointvec, }, -- cgit v1.2.3 From df58ab24bf26b166874bfb18b3b5a2e0a8e63179 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 11:25:05 +0200 Subject: perf_counter: Rename perf_counter_limit sysctl Rename perf_counter_limit to perf_counter_max_sample_rate and prohibit creation of counters with a known higher sample frequency. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 27 +++++++++++++++++++-------- kernel/sysctl.c | 6 +++--- 2 files changed, 22 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 63f1987c1c1..3b2829de559 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -44,11 +44,12 @@ static atomic_t nr_mmap_counters __read_mostly; static atomic_t nr_comm_counters __read_mostly; /* - * 0 - not paranoid - * 1 - disallow cpu counters to unpriv - * 2 - disallow kernel profiling to unpriv + * perf counter paranoia level: + * 0 - not paranoid + * 1 - disallow cpu counters to unpriv + * 2 - disallow kernel profiling to unpriv */ -int sysctl_perf_counter_paranoid __read_mostly; /* do we need to be privileged */ +int sysctl_perf_counter_paranoid __read_mostly; static inline bool perf_paranoid_cpu(void) { @@ -61,7 +62,11 @@ static inline bool perf_paranoid_kernel(void) } int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ -int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */ + +/* + * max perf counter sample rate + */ +int sysctl_perf_counter_sample_rate __read_mostly = 100000; static atomic64_t perf_counter_id; @@ -1244,7 +1249,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(counter, 1); counter->pmu->unthrottle(counter); - interrupts = 2*sysctl_perf_counter_limit/HZ; + interrupts = 2*sysctl_perf_counter_sample_rate/HZ; } if (!counter->attr.freq || !counter->attr.sample_freq) @@ -1682,7 +1687,7 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) spin_lock_irq(&ctx->lock); if (counter->attr.freq) { - if (value > sysctl_perf_counter_limit) { + if (value > sysctl_perf_counter_sample_rate) { ret = -EINVAL; goto unlock; } @@ -2979,7 +2984,8 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi, } else { if (hwc->interrupts != MAX_INTERRUPTS) { hwc->interrupts++; - if (HZ * hwc->interrupts > (u64)sysctl_perf_counter_limit) { + if (HZ * hwc->interrupts > + (u64)sysctl_perf_counter_sample_rate) { hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(counter, 0); ret = 1; @@ -3639,6 +3645,11 @@ SYSCALL_DEFINE5(perf_counter_open, return -EACCES; } + if (attr.freq) { + if (attr.sample_freq > sysctl_perf_counter_sample_rate) + return -EINVAL; + } + /* * Get the target context (task or percpu): */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 344a65981de..9fd4e436b69 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -932,9 +932,9 @@ static struct ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_int_limit", - .data = &sysctl_perf_counter_limit, - .maxlen = sizeof(sysctl_perf_counter_limit), + .procname = "perf_counter_max_sample_rate", + .data = &sysctl_perf_counter_sample_rate, + .maxlen = sizeof(sysctl_perf_counter_sample_rate), .mode = 0644, .proc_handler = &proc_dointvec, }, -- cgit v1.2.3 From 1c432d899d32d36371ee4ee310fa3609cf0e5742 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 13:19:29 +0200 Subject: perf_counter: Rename enums Rename the perf enums to be in the 'perf_' namespace and strictly enumerate the ABI bits. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3b2829de559..c02535bed26 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3162,7 +3162,7 @@ static int perf_swcounter_is_counting(struct perf_counter *counter) } static int perf_swcounter_match(struct perf_counter *counter, - enum perf_event_types type, + enum perf_type_id type, u32 event, struct pt_regs *regs) { if (!perf_swcounter_is_counting(counter)) @@ -3194,7 +3194,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, } static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, - enum perf_event_types type, u32 event, + enum perf_type_id type, u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { @@ -3225,7 +3225,7 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) return &cpuctx->recursion[0]; } -static void __perf_swcounter_event(enum perf_event_types type, u32 event, +static void __perf_swcounter_event(enum perf_type_id type, u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { -- cgit v1.2.3 From f4dbfa8f3131a84257223393905f7efad0ca5996 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 14:06:28 +0200 Subject: perf_counter: Standardize event names Pure renames only, to PERF_COUNT_HW_* and PERF_COUNT_SW_*. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c02535bed26..8859b97390e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1024,7 +1024,7 @@ void perf_counter_task_sched_out(struct task_struct *task, int do_switch = 1; regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); + perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); if (likely(!ctx || !cpuctx->task_ctx)) return; @@ -3411,13 +3411,13 @@ void perf_counter_task_migration(struct task_struct *task, int cpu) struct perf_counter_context *ctx; perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE, - PERF_COUNT_CPU_MIGRATIONS, + PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); ctx = perf_pin_task_context(task); if (ctx) { perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE, - PERF_COUNT_CPU_MIGRATIONS, + PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); perf_unpin_context(ctx); } @@ -3475,11 +3475,11 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * events. */ switch (counter->attr.config) { - case PERF_COUNT_CPU_CLOCK: + case PERF_COUNT_SW_CPU_CLOCK: pmu = &perf_ops_cpu_clock; break; - case PERF_COUNT_TASK_CLOCK: + case PERF_COUNT_SW_TASK_CLOCK: /* * If the user instantiates this as a per-cpu counter, * use the cpu_clock counter instead. @@ -3490,11 +3490,11 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) pmu = &perf_ops_cpu_clock; break; - case PERF_COUNT_PAGE_FAULTS: - case PERF_COUNT_PAGE_FAULTS_MIN: - case PERF_COUNT_PAGE_FAULTS_MAJ: - case PERF_COUNT_CONTEXT_SWITCHES: - case PERF_COUNT_CPU_MIGRATIONS: + case PERF_COUNT_SW_PAGE_FAULTS: + case PERF_COUNT_SW_PAGE_FAULTS_MIN: + case PERF_COUNT_SW_PAGE_FAULTS_MAJ: + case PERF_COUNT_SW_CONTEXT_SWITCHES: + case PERF_COUNT_SW_CPU_MIGRATIONS: pmu = &perf_ops_generic; break; } -- cgit v1.2.3 From cca3f454a85ff42d426401bce7ac804541b2bd03 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 14:57:55 +0200 Subject: perf_counter: Add counter->id to the throttle event So as to be able to distuinguish between multiple counters. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8859b97390e..ef5d8a5b245 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2950,13 +2950,15 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) struct { struct perf_event_header header; u64 time; + u64 id; } throttle_event = { .header = { .type = PERF_EVENT_THROTTLE + 1, .misc = 0, .size = sizeof(throttle_event), }, - .time = sched_clock(), + .time = sched_clock(), + .id = counter->id, }; ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); -- cgit v1.2.3 From 4f2294b6dc88d99295230d97fef2c9863cec44c3 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 11 Jun 2009 13:23:20 +0100 Subject: kmemleak: Add modules support This patch handles the kmemleak operations needed for modules loading so that memory allocations from inside a module are properly tracked. Signed-off-by: Catalin Marinas --- kernel/module.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 2383e60fcf3..5cd55ab15da 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -53,6 +53,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -430,6 +431,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align, unsigned long extra; unsigned int i; void *ptr; + int cpu; if (align > PAGE_SIZE) { printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", @@ -459,6 +461,11 @@ static void *percpu_modalloc(unsigned long size, unsigned long align, if (!split_block(i, size)) return NULL; + /* add the per-cpu scanning areas */ + for_each_possible_cpu(cpu) + kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0, + GFP_KERNEL); + /* Mark allocated */ pcpu_size[i] = -pcpu_size[i]; return ptr; @@ -473,6 +480,7 @@ static void percpu_modfree(void *freeme) { unsigned int i; void *ptr = __per_cpu_start + block_size(pcpu_size[0]); + int cpu; /* First entry is core kernel percpu data. */ for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { @@ -484,6 +492,10 @@ static void percpu_modfree(void *freeme) BUG(); free: + /* remove the per-cpu scanning areas */ + for_each_possible_cpu(cpu) + kmemleak_free(freeme + per_cpu_offset(cpu)); + /* Merge with previous? */ if (pcpu_size[i-1] >= 0) { pcpu_size[i-1] += pcpu_size[i]; @@ -1876,6 +1888,36 @@ static void *module_alloc_update_bounds(unsigned long size) return ret; } +#ifdef CONFIG_DEBUG_KMEMLEAK +static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, char *secstrings) +{ + unsigned int i; + + /* only scan the sections containing data */ + kmemleak_scan_area(mod->module_core, (unsigned long)mod - + (unsigned long)mod->module_core, + sizeof(struct module), GFP_KERNEL); + + for (i = 1; i < hdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0 + && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) + continue; + + kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr - + (unsigned long)mod->module_core, + sechdrs[i].sh_size, GFP_KERNEL); + } +} +#else +static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, char *secstrings) +{ +} +#endif + /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ static noinline struct module *load_module(void __user *umod, @@ -2046,6 +2088,12 @@ static noinline struct module *load_module(void __user *umod, /* Do the allocs. */ ptr = module_alloc_update_bounds(mod->core_size); + /* + * The pointer to this block is stored in the module structure + * which is inside the block. Just mark it as not being a + * leak. + */ + kmemleak_not_leak(ptr); if (!ptr) { err = -ENOMEM; goto free_percpu; @@ -2054,6 +2102,13 @@ static noinline struct module *load_module(void __user *umod, mod->module_core = ptr; ptr = module_alloc_update_bounds(mod->init_size); + /* + * The pointer to this block is stored in the module structure + * which is inside the block. This block doesn't need to be + * scanned as it contains data and code that will be freed + * after the module is initialized. + */ + kmemleak_ignore(ptr); if (!ptr && mod->init_size) { err = -ENOMEM; goto free_core; @@ -2084,6 +2139,7 @@ static noinline struct module *load_module(void __user *umod, } /* Module has been moved. */ mod = (void *)sechdrs[modindex].sh_addr; + kmemleak_load_module(mod, hdr, sechdrs, secstrings); #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), -- cgit v1.2.3 From 36b7b6d465489c4754c4fd66fcec6086eba87896 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 10 Jun 2009 23:42:36 +0300 Subject: sched: use kzalloc() instead of the bootmem allocator Now that kmem_cache_init() happens before sched_init(), we should use kzalloc() and not the bootmem allocator. Signed-off-by: Pekka Enberg --- kernel/sched.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 14c447ae5d5..a9ff9533355 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -68,7 +68,6 @@ #include #include #include -#include #include #include #include @@ -7782,21 +7781,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) { + gfp_t gfp = GFP_KERNEL; + memset(rd, 0, sizeof(*rd)); - if (bootmem) { - alloc_bootmem_cpumask_var(&def_root_domain.span); - alloc_bootmem_cpumask_var(&def_root_domain.online); - alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); - cpupri_init(&rd->cpupri, true); - return 0; - } + if (bootmem) + gfp = GFP_NOWAIT; - if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->span, gfp)) goto out; - if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->online, gfp)) goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->rto_mask, gfp)) goto free_online; if (cpupri_init(&rd->cpupri, false) != 0) @@ -9123,7 +9119,7 @@ void __init sched_init(void) * we use alloc_bootmem(). */ if (alloc_size) { - ptr = (unsigned long)alloc_bootmem(alloc_size); + ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.se = (struct sched_entity **)ptr; -- cgit v1.2.3 From 38c7fed2f5ffee17e1fa3e0f78b0e1bf43d52d13 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 25 May 2009 15:10:58 +0300 Subject: x86: remove some alloc_bootmem_cpumask_var calling Now that we set up the slab allocator earlier, we can get rid of some alloc_bootmem_cpumask_var() calls in boot code. Cc: Ingo Molnar Cc: Johannes Weiner Cc: Linus Torvalds Signed-off-by: Yinghai Lu Signed-off-by: Pekka Enberg --- kernel/cpuset.c | 2 +- kernel/profile.c | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 026faccca86..d5a7e17474e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1857,7 +1857,7 @@ struct cgroup_subsys cpuset_subsys = { int __init cpuset_init_early(void) { - alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed); + alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT); top_cpuset.mems_generation = cpuset_mems_generation++; return 0; diff --git a/kernel/profile.c b/kernel/profile.c index 7724e0409ba..28cf26ad2d2 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -111,12 +111,6 @@ int __ref profile_init(void) /* only text is profiled */ prof_len = (_etext - _stext) >> prof_shift; buffer_bytes = prof_len*sizeof(atomic_t); - if (!slab_is_available()) { - prof_buffer = alloc_bootmem(buffer_bytes); - alloc_bootmem_cpumask_var(&prof_cpu_mask); - cpumask_copy(prof_cpu_mask, cpu_possible_mask); - return 0; - } if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) return -ENOMEM; -- cgit v1.2.3 From dad213aeb59718623fc59defeff95fe8c3feb8a0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 28 May 2009 18:14:40 -0700 Subject: irq/cpumask: make memoryless node zero happy Don't hardcode to node zero for early boot IRQ setup memory allocations. [ penberg@cs.helsinki.fi: minor cleanups ] Cc: Ingo Molnar Cc: Johannes Weiner Cc: Linus Torvalds Signed-off-by: Yinghai Lu Signed-off-by: Pekka Enberg --- kernel/irq/handle.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a60018402f4..e161999b668 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -150,6 +150,7 @@ int __init early_irq_init(void) { struct irq_desc *desc; int legacy_count; + int node; int i; init_irq_default_affinity(); @@ -160,20 +161,20 @@ int __init early_irq_init(void) desc = irq_desc_legacy; legacy_count = ARRAY_SIZE(irq_desc_legacy); + node = first_online_node; /* allocate irq_desc_ptrs array based on nr_irqs */ irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); /* allocate based on nr_cpu_ids */ - /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */ - kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids * - sizeof(int)); + kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * + sizeof(int), GFP_NOWAIT, node); for (i = 0; i < legacy_count; i++) { desc[i].irq = i; desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - alloc_desc_masks(&desc[i], 0, true); + alloc_desc_masks(&desc[i], node, true); init_desc_masks(&desc[i]); irq_desc_ptrs[i] = desc + i; } -- cgit v1.2.3 From 4bdddf8ff9bbb8aa7b4d7847586202bd25842c90 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 11 Jun 2009 08:35:27 +0300 Subject: sched: use alloc_cpumask_var() instead of alloc_bootmem_cpumask_var() Slab is initialized when sched_init() runs now so lets use alloc_cpumask_var(). Cc: Ingo Molnar Cc: Linus Torvalds Cc: Yinghai Lu Signed-off-by: Pekka Enberg --- kernel/sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a9ff9533355..12cc09cf514 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9310,13 +9310,13 @@ void __init sched_init(void) current->sched_class = &fair_sched_class; /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ - alloc_bootmem_cpumask_var(&nohz_cpu_mask); + alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - alloc_bootmem_cpumask_var(&nohz.cpu_mask); - alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask); + alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); + alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); #endif - alloc_bootmem_cpumask_var(&cpu_isolated_map); + alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ scheduler_running = 1; -- cgit v1.2.3 From 0fb530291621c8b8a1b16abeeab05d9262489f71 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 11 Jun 2009 08:41:22 +0300 Subject: sched: use slab in cpupri_init() Lets not use the bootmem allocator in cpupri_init() as slab is already up when it is run. Cc: Ingo Molnar Cc: Linus Torvalds Cc: Yinghai Lu Signed-off-by: Pekka Enberg --- kernel/sched.c | 2 +- kernel/sched_cpupri.c | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 12cc09cf514..dcf2dc28931 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7795,7 +7795,7 @@ static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) if (!alloc_cpumask_var(&rd->rto_mask, gfp)) goto free_online; - if (cpupri_init(&rd->cpupri, false) != 0) + if (cpupri_init(&rd->cpupri, bootmem) != 0) goto free_rto_mask; return 0; diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 344712a5e3e..7deffc9f0e5 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -154,8 +154,12 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) */ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) { + gfp_t gfp = GFP_KERNEL; int i; + if (bootmem) + gfp = GFP_NOWAIT; + memset(cp, 0, sizeof(*cp)); for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { @@ -163,9 +167,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) spin_lock_init(&vec->lock); vec->count = 0; - if (bootmem) - alloc_bootmem_cpumask_var(&vec->mask); - else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&vec->mask, gfp)) goto cleanup; } -- cgit v1.2.3 From 22fb4e71e646695c7e0f379ada66b372c2d1aa1a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 11 Jun 2009 14:46:49 +0300 Subject: irq: use kcalloc() instead of the bootmem allocator Fixes the following problem: [ 0.000000] Experimental hierarchical RCU init done. [ 0.000000] NR_IRQS:4352 nr_irqs:256 [ 0.000000] ------------[ cut here ]------------ [ 0.000000] WARNING: at mm/bootmem.c:537 alloc_arch_preferred_bootmem+0x40/0x7e() [ 0.000000] Hardware name: To Be Filled By O.E.M. [ 0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-tip-02161-g7a74539-dirty #59709 [ 0.000000] Call Trace: [ 0.000000] [] ? alloc_arch_preferred_bootmem+0x40/0x7e [ 0.000000] [] warn_slowpath_common+0x88/0xcb [ 0.000000] [] warn_slowpath_null+0x27/0x3d [ 0.000000] [] alloc_arch_preferred_bootmem+0x40/0x7e [ 0.000000] [] ___alloc_bootmem_nopanic+0x4e/0xec [ 0.000000] [] ___alloc_bootmem+0x20/0x61 [ 0.000000] [] __alloc_bootmem+0x1e/0x34 [ 0.000000] [] early_irq_init+0x6d/0x118 [ 0.000000] [] ? early_idt_handler+0x0/0x71 [ 0.000000] [] start_kernel+0x192/0x394 [ 0.000000] [] ? early_idt_handler+0x0/0x71 [ 0.000000] [] x86_64_start_reservations+0xb4/0xcf [ 0.000000] [] ? __init_begin+0x0/0x140 [ 0.000000] [] x86_64_start_kernel+0x158/0x17b [ 0.000000] ---[ end trace a7919e7f17c0a725 ]--- [ 0.000000] Fast TSC calibration using PIT [ 0.000000] Detected 2002.510 MHz processor. [ 0.004000] Console: colour VGA+ 80x25 Reported-by: Ingo Molnar Signed-off-by: Pekka Enberg --- kernel/irq/handle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e161999b668..10457854123 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -164,7 +164,7 @@ int __init early_irq_init(void) node = first_online_node; /* allocate irq_desc_ptrs array based on nr_irqs */ - irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); + irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT); /* allocate based on nr_cpu_ids */ kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * -- cgit v1.2.3 From b415c49a864dab8ee90713833d642dd461eccae9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 11 Jun 2009 13:12:55 +0100 Subject: slow_work_thread() should do the exclusive wait slow_work_thread() sleeps on slow_work_thread_wq without WQ_FLAG_EXCLUSIVE, this means that slow_work_enqueue()->__wake_up(nr_exclusive => 1) wakes up all kslowd threads. This is not what we want, so we change slow_work_thread() to use prepare_to_wait_exclusive() instead. Signed-off-by: Oleg Nesterov Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/slow-work.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/slow-work.c b/kernel/slow-work.c index b28d19135f4..521ed2004d6 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -372,8 +372,8 @@ static int slow_work_thread(void *_data) vsmax *= atomic_read(&slow_work_thread_count); vsmax /= 100; - prepare_to_wait(&slow_work_thread_wq, &wait, - TASK_INTERRUPTIBLE); + prepare_to_wait_exclusive(&slow_work_thread_wq, &wait, + TASK_INTERRUPTIBLE); if (!freezing(current) && !slow_work_threads_should_exit && !slow_work_available(vsmax) && -- cgit v1.2.3 From 589ff870ed60a9ebdd5ec99ec3f5afe1282fe151 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2009 03:28:19 -0400 Subject: Switch collect_mounts() to struct path Signed-off-by: Al Viro --- kernel/audit_tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 6e7351739a8..1f6396d7668 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -568,7 +568,7 @@ void audit_trim_trees(void) if (err) goto skip_it; - root_mnt = collect_mounts(path.mnt, path.dentry); + root_mnt = collect_mounts(&path); path_put(&path); if (!root_mnt) goto skip_it; @@ -660,7 +660,7 @@ int audit_add_tree_rule(struct audit_krule *rule) err = kern_path(tree->pathname, 0, &path); if (err) goto Err; - mnt = collect_mounts(path.mnt, path.dentry); + mnt = collect_mounts(&path); path_put(&path); if (!mnt) { err = -ENOMEM; @@ -720,7 +720,7 @@ int audit_tag_tree(char *old, char *new) err = kern_path(new, 0, &path); if (err) return err; - tagged = collect_mounts(path.mnt, path.dentry); + tagged = collect_mounts(&path); path_put(&path); if (!tagged) return -ENOMEM; -- cgit v1.2.3 From 337eb00a2c3a421999c39c94ce7e33545ee8baa7 Mon Sep 17 00:00:00 2001 From: Alessio Igor Bogani Date: Tue, 12 May 2009 15:10:54 +0200 Subject: Push BKL down into ->remount_fs() [xfs, btrfs, capifs, shmem don't need BKL, exempt] Signed-off-by: Alessio Igor Bogani Signed-off-by: Al Viro --- kernel/cgroup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a7267bfd376..3fb789f6df9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -46,6 +46,7 @@ #include #include #include +#include #include @@ -900,6 +901,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) struct cgroup *cgrp = &root->top_cgroup; struct cgroup_sb_opts opts; + lock_kernel(); mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); @@ -927,6 +929,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) kfree(opts.release_agent); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + unlock_kernel(); return ret; } -- cgit v1.2.3 From 28be225b23b115573e0ecc8ef9996f42a1652f74 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 12 Jun 2009 11:33:02 +0300 Subject: irq: slab alloc for default irq_affinity Ingo had [ 0.000000] ------------[ cut here ]------------ [ 0.000000] WARNING: at mm/bootmem.c:537 alloc_arch_preferred_bootmem+0x2b/0x71() [ 0.000000] Hardware name: System Product Name [ 0.000000] Modules linked in: [ 0.000000] Pid: 0, comm: swapper Tainted: G W 2.6.30-tip-03087-g0bb2618-dirty #52506 [ 0.000000] Call Trace: [ 0.000000] [<81032588>] warn_slowpath_common+0x60/0x90 [ 0.000000] [<810325c5>] warn_slowpath_null+0xd/0x10 [ 0.000000] [<819d1bc0>] alloc_arch_preferred_bootmem+0x2b/0x71 [ 0.000000] [<819d1c31>] ___alloc_bootmem_nopanic+0x2b/0x9a [ 0.000000] [<81050a0a>] ? lock_release+0xac/0xb2 [ 0.000000] [<819d1d4c>] ___alloc_bootmem+0xe/0x2d [ 0.000000] [<819d1e9f>] __alloc_bootmem+0xa/0xc [ 0.000000] [<819d7c63>] alloc_bootmem_cpumask_var+0x21/0x26 [ 0.000000] [<819d0cc8>] early_irq_init+0x15/0x10d [ 0.000000] [<819bb75a>] start_kernel+0x167/0x326 [ 0.000000] [<819bb06b>] __init_begin+0x6b/0x70 [ 0.000000] ---[ end trace 4eaa2a86a8e2da23 ]--- [ 0.000000] NR_IRQS:2304 nr_irqs:424 [ 0.000000] CPU 0 irqstacks, hard=821e6000 soft=821e7000 we need to update init_irq_default_affinity Signed-off-by: Yinghai Lu Signed-off-by: Pekka Enberg --- kernel/irq/handle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 10457854123..065205bdd92 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -45,7 +45,7 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc) #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) static void __init init_irq_default_affinity(void) { - alloc_bootmem_cpumask_var(&irq_default_affinity); + alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); cpumask_setall(irq_default_affinity); } #else -- cgit v1.2.3 From 9a71af2c3627b379b7c31917a7f6ee0d29bc559b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 21:46:53 -0600 Subject: module_param: invbool should take a 'bool', not an 'int' It takes an 'int' for historical reasons, and there are only two users: simply switch it over to bool. The other user (uvesafb.c) will get a (harmless-on-x86) warning until the next patch is applied. Cc: Brad Douglas Cc: Michal Januszewski Signed-off-by: Rusty Russell --- kernel/params.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index de273ec85bd..023abbf5f89 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -272,13 +272,13 @@ int param_set_invbool(const char *val, struct kernel_param *kp) dummy.arg = &boolval; ret = param_set_bool(val, &dummy); if (ret == 0) - *(int *)kp->arg = !boolval; + *(bool *)kp->arg = !boolval; return ret; } int param_get_invbool(char *buffer, struct kernel_param *kp) { - return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'N' : 'Y'); + return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); } /* We break the rule and mangle the string. */ -- cgit v1.2.3 From 45fcc70c0b6ee0c508e1fdb5fef735c3546803f4 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 21:46:56 -0600 Subject: module_param: split perm field into flags and perm Impact: cleanup Rather than hack KPARAM_KMALLOCED into the perm field, separate it out. Since the perm field was 32 bits and only needs 16, we don't add bloat. Signed-off-by: Rusty Russell --- kernel/params.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 023abbf5f89..b4660dc13db 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -24,9 +24,6 @@ #include #include -/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */ -#define KPARAM_KMALLOCED 0x80000000 - #if 0 #define DEBUGP printk #else @@ -220,13 +217,13 @@ int param_set_charp(const char *val, struct kernel_param *kp) return -ENOSPC; } - if (kp->perm & KPARAM_KMALLOCED) + if (kp->flags & KPARAM_KMALLOCED) kfree(*(char **)kp->arg); /* This is a hack. We can't need to strdup in early boot, and we * don't need to; this mangled commandline is preserved. */ if (slab_is_available()) { - kp->perm |= KPARAM_KMALLOCED; + kp->flags |= KPARAM_KMALLOCED; *(char **)kp->arg = kstrdup(val, GFP_KERNEL); if (!kp->arg) return -ENOMEM; @@ -591,7 +588,7 @@ void destroy_params(const struct kernel_param *params, unsigned num) unsigned int i; for (i = 0; i < num; i++) - if (params[i].perm & KPARAM_KMALLOCED) + if (params[i].flags & KPARAM_KMALLOCED) kfree(*(char **)params[i].arg); } -- cgit v1.2.3 From fddd520122953550ec2c8b60e7ca0d0f0d115d97 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 21:46:57 -0600 Subject: module_param: allow 'bool' module_params to be bool, not just int. Impact: API cleanup For historical reasons, 'bool' parameters must be an int, not a bool. But there are around 600 users, so a conversion seems like useless churn. So we use __same_type() to distinguish, and handle both cases. Signed-off-by: Rusty Russell --- kernel/params.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index b4660dc13db..7f6912ced2b 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -238,35 +238,54 @@ int param_get_charp(char *buffer, struct kernel_param *kp) return sprintf(buffer, "%s", *((char **)kp->arg)); } +/* Actually could be a bool or an int, for historical reasons. */ int param_set_bool(const char *val, struct kernel_param *kp) { + bool v; + /* No equals means "set"... */ if (!val) val = "1"; /* One of =[yYnN01] */ switch (val[0]) { case 'y': case 'Y': case '1': - *(int *)kp->arg = 1; - return 0; + v = true; + break; case 'n': case 'N': case '0': - *(int *)kp->arg = 0; - return 0; + v = false; + break; + default: + return -EINVAL; } - return -EINVAL; + + if (kp->flags & KPARAM_ISBOOL) + *(bool *)kp->arg = v; + else + *(int *)kp->arg = v; + return 0; } int param_get_bool(char *buffer, struct kernel_param *kp) { + bool val; + if (kp->flags & KPARAM_ISBOOL) + val = *(bool *)kp->arg; + else + val = *(int *)kp->arg; + /* Y and N chosen as being relatively non-coder friendly */ - return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N'); + return sprintf(buffer, "%c", val ? 'Y' : 'N'); } +/* This one must be bool. */ int param_set_invbool(const char *val, struct kernel_param *kp) { - int boolval, ret; + int ret; + bool boolval; struct kernel_param dummy; dummy.arg = &boolval; + dummy.flags = KPARAM_ISBOOL; ret = param_set_bool(val, &dummy); if (ret == 0) *(bool *)kp->arg = !boolval; -- cgit v1.2.3 From ad6561dffa17f17bb68d7207d422c26c381c4313 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 21:47:03 -0600 Subject: module: trim exception table on init free. It's theoretically possible that there are exception table entries which point into the (freed) init text of modules. These could cause future problems if other modules get loaded into that memory and cause an exception as we'd see the wrong fixup. The only case I know of is kvm-intel.ko (when CONFIG_CC_OPTIMIZE_FOR_SIZE=n). Amerigo fixed this long-standing FIXME in the x86 version, but this patch is more general. This implements trim_init_extable(); most archs are simple since they use the standard lib/extable.c sort code. Alpha and IA64 use relative addresses in their fixups, so thier trimming is a slight variation. Sparc32 is unique; it doesn't seem to define ARCH_HAS_SORT_EXTABLE, yet it defines its own sort_extable() which overrides the one in lib. It doesn't sort, so we have to mark deleted entries instead of actually trimming them. Inspired-by: Amerigo Wang Signed-off-by: Rusty Russell Cc: linux-alpha@vger.kernel.org Cc: sparclinux@vger.kernel.org Cc: linux-ia64@vger.kernel.org --- kernel/module.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 35f7de00bf0..e4ab36ce767 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2455,6 +2455,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, mutex_lock(&module_mutex); /* Drop initial reference. */ module_put(mod); + trim_init_extable(mod); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; -- cgit v1.2.3 From 081fad86178ec0f64f32f1bd04cf4aad22714fb9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 17:57:21 +0200 Subject: perf_counter: Remove PERF_TYPE_RAW special casing The PERF_TYPE_RAW special case seems superfluous these days. Remove it and add it to the switch() stmt like the others. [ Impact: cleanup ] Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ef5d8a5b245..663bbe01505 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3570,12 +3570,8 @@ perf_counter_alloc(struct perf_counter_attr *attr, if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) goto done; - if (attr->type == PERF_TYPE_RAW) { - pmu = hw_perf_counter_init(counter); - goto done; - } - switch (attr->type) { + case PERF_TYPE_RAW: case PERF_TYPE_HARDWARE: case PERF_TYPE_HW_CACHE: pmu = hw_perf_counter_init(counter); -- cgit v1.2.3 From 974802eaa1afdc87e00821df7020a2b3c6fee623 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 12 Jun 2009 12:46:55 +0200 Subject: perf_counter: Add forward/backward attribute ABI compatibility Provide for means of extending the perf_counter_attr in a 'natural' way. We allow growing the structure by appending fields at the end by specifying the full structure size inside it. When a new kernel sees a smaller (old) structure, it will 0 pad the tail. When an old kernel sees a larger (new) structure, it will verify the tail consists of 0s, otherwise fail. If we fail due to a size-mismatch, we return -E2BIG and write the kernel's native attribe size back into the provided structure. Furthermore, add some attribute verification, so that we'll fail counter creation when unknown bits are present (PERF_SAMPLE, PERF_FORMAT, or in the __reserved fields). (This ABI detail is introduced while keeping the existing syscall ABI.) Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 663bbe01505..29b685f551a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3584,6 +3584,9 @@ perf_counter_alloc(struct perf_counter_attr *attr, case PERF_TYPE_TRACEPOINT: pmu = tp_perf_counter_init(counter); break; + + default: + break; } done: err = 0; @@ -3610,6 +3613,85 @@ done: return counter; } +static int perf_copy_attr(struct perf_counter_attr __user *uattr, + struct perf_counter_attr *attr) +{ + int ret; + u32 size; + + if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) + return -EFAULT; + + /* + * zero the full structure, so that a short copy will be nice. + */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + if (size > PAGE_SIZE) /* silly large */ + goto err_size; + + if (!size) /* abi compat */ + size = PERF_ATTR_SIZE_VER0; + + if (size < PERF_ATTR_SIZE_VER0) + goto err_size; + + /* + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0. + */ + if (size > sizeof(*attr)) { + unsigned long val; + unsigned long __user *addr; + unsigned long __user *end; + + addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr), + sizeof(unsigned long)); + end = PTR_ALIGN((void __user *)uattr + size, + sizeof(unsigned long)); + + for (; addr < end; addr += sizeof(unsigned long)) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * If the type exists, the corresponding creation will verify + * the attr->config. + */ + if (attr->type >= PERF_TYPE_MAX) + return -EINVAL; + + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) + return -EINVAL; + + if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) + return -EINVAL; + + if (attr->read_format & ~(PERF_FORMAT_MAX-1)) + return -EINVAL; + +out: + return ret; + +err_size: + put_user(sizeof(*attr), &uattr->size); + ret = -E2BIG; + goto out; +} + /** * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * @@ -3619,7 +3701,7 @@ done: * @group_fd: group leader counter fd */ SYSCALL_DEFINE5(perf_counter_open, - const struct perf_counter_attr __user *, attr_uptr, + struct perf_counter_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_counter *counter, *group_leader; @@ -3635,8 +3717,9 @@ SYSCALL_DEFINE5(perf_counter_open, if (flags) return -EINVAL; - if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0) - return -EFAULT; + ret = perf_copy_attr(attr_uptr, &attr); + if (ret) + return ret; if (!attr.exclude_kernel) { if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) -- cgit v1.2.3 From b43e352139f51216a8c56b0bd5fc3d4e05c65619 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:00 -0600 Subject: sched: export kick_process lguest needs kick_process: wake_up_process() does nothing if a process is running, which isn't sufficient (we need it in the kernel). And lguest support is usually modular. Signed-off-by: Rusty Russell Cc: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f04aa966450..8ec9d13140b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2192,6 +2192,7 @@ void kick_process(struct task_struct *p) smp_send_reschedule(cpu); preempt_enable(); } +EXPORT_SYMBOL_GPL(kick_process); /* * Return a low guess at the load of a migration-source cpu weighted -- cgit v1.2.3 From 1dc492a0a4470852cb451db1e00d580ce9fd7a28 Mon Sep 17 00:00:00 2001 From: Manish Katiyar Date: Sun, 22 Feb 2009 10:24:27 +0530 Subject: trivial: kernel/power/poweroff.c: whitespace fix Fix coding style whitespace fixes. Patch compile tested Before :- total: 1 errors, 0 warnings, 46 lines checked After total: 0 errors, 0 warnings, 46 lines checked Before :- text data bss dec hex filename 107 48 0 155 9b kernel/power/poweroff.o After text data bss dec hex filename 107 48 0 155 9b kernel/power/poweroff.o Signed-off-by: Manish Katiyar Signed-off-by: Jiri Kosina --- kernel/power/poweroff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 97890831e1b..e8b33700627 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -34,7 +34,7 @@ static struct sysrq_key_op sysrq_poweroff_op = { .handler = handle_poweroff, .help_msg = "powerOff", .action_msg = "Power Off", - .enable_mask = SYSRQ_ENABLE_BOOT, + .enable_mask = SYSRQ_ENABLE_BOOT, }; static int pm_sysrq_init(void) -- cgit v1.2.3 From 3ac49a1c9928b4a242b3cb1d83bc1d5c9b8fcb50 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Thu, 4 Jun 2009 16:20:28 +0200 Subject: trivial: fix ETIMEOUT -> ETIMEDOUT typos fix ETIMEOUT -> ETIMEDOUT typos Signed-off-by: Jean Delvare Signed-off-by: Jiri Kosina --- kernel/rtmutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 820c5af44f3..fcd107a78c5 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -902,7 +902,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); * Returns: * 0 on success * -EINTR when interrupted by a signal - * -ETIMEOUT when the timeout expired + * -ETIMEDOUT when the timeout expired * -EDEADLK when the lock would deadlock (when deadlock detection is on) */ int -- cgit v1.2.3 From e39a71ef80877f4e30d808af9acceec80f4d2f7c Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 15 May 2009 00:53:26 +0200 Subject: PM: Rename device_power_down/up() Rename the functions performing "_noirq" dev_pm_ops operations from device_power_down() and device_power_up() to device_suspend_noirq() and device_resume_noirq(). The new function names are chosen to show that the functions are responsible for calling the _noirq() versions to finalize the suspend/resume operation. The current function names do not perform power down/up anymore so the names may be misleading. Global function renames: - device_power_down() -> device_suspend_noirq() - device_power_up() -> device_resume_noirq() Static function renames: - suspend_device_noirq() -> __device_suspend_noirq() - resume_device_noirq() -> __device_resume_noirq() Signed-off-by: Magnus Damm Acked-by: Greg Kroah-Hartman Acked-by: Len Brown Signed-off-by: Rafael J. Wysocki --- kernel/kexec.c | 8 ++++---- kernel/power/disk.c | 16 ++++++++-------- kernel/power/main.c | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index e4983770913..5a3da87adae 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1452,13 +1452,13 @@ int kernel_kexec(void) if (error) goto Resume_console; /* At this point, device_suspend() has been called, - * but *not* device_power_down(). We *must* - * device_power_down() now. Otherwise, drivers for + * but *not* device_suspend_noirq(). We *must* call + * device_suspend_noirq() now. Otherwise, drivers for * some devices (e.g. interrupt controllers) become * desynchronized with the actual state of the * hardware at resume time, and evil weirdness ensues. */ - error = device_power_down(PMSG_FREEZE); + error = device_suspend_noirq(PMSG_FREEZE); if (error) goto Resume_devices; error = disable_nonboot_cpus(); @@ -1486,7 +1486,7 @@ int kernel_kexec(void) local_irq_enable(); Enable_cpus: enable_nonboot_cpus(); - device_power_up(PMSG_RESTORE); + device_resume_noirq(PMSG_RESTORE); Resume_devices: device_resume(PMSG_RESTORE); Resume_console: diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 5cb080e7eeb..1c18bc894a2 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -216,12 +216,12 @@ static int create_image(int platform_mode) return error; /* At this point, device_suspend() has been called, but *not* - * device_power_down(). We *must* call device_power_down() now. + * device_suspend_noirq(). We *must* call device_suspend_noirq() now. * Otherwise, drivers for some devices (e.g. interrupt controllers) * become desynchronized with the actual state of the hardware * at resume time, and evil weirdness ensues. */ - error = device_power_down(PMSG_FREEZE); + error = device_suspend_noirq(PMSG_FREEZE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); @@ -262,7 +262,7 @@ static int create_image(int platform_mode) Power_up: sysdev_resume(); - /* NOTE: device_power_up() is just a resume() for devices + /* NOTE: device_resume_noirq() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ @@ -275,7 +275,7 @@ static int create_image(int platform_mode) Platform_finish: platform_finish(platform_mode); - device_power_up(in_suspend ? + device_resume_noirq(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); return error; @@ -339,7 +339,7 @@ static int resume_target_kernel(bool platform_mode) { int error; - error = device_power_down(PMSG_QUIESCE); + error = device_suspend_noirq(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -394,7 +394,7 @@ static int resume_target_kernel(bool platform_mode) Cleanup: platform_restore_cleanup(platform_mode); - device_power_up(PMSG_RECOVER); + device_resume_noirq(PMSG_RECOVER); return error; } @@ -454,7 +454,7 @@ int hibernation_platform_enter(void) goto Resume_devices; } - error = device_power_down(PMSG_HIBERNATE); + error = device_suspend_noirq(PMSG_HIBERNATE); if (error) goto Resume_devices; @@ -479,7 +479,7 @@ int hibernation_platform_enter(void) Platofrm_finish: hibernation_ops->finish(); - device_power_up(PMSG_RESTORE); + device_suspend_noirq(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; diff --git a/kernel/power/main.c b/kernel/power/main.c index 868028280d1..2f6638ee03c 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -295,7 +295,7 @@ static int suspend_enter(suspend_state_t state) return error; } - error = device_power_down(PMSG_SUSPEND); + error = device_suspend_noirq(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); goto Platfrom_finish; @@ -335,7 +335,7 @@ static int suspend_enter(suspend_state_t state) suspend_ops->wake(); Power_up_devices: - device_power_up(PMSG_RESUME); + device_resume_noirq(PMSG_RESUME); Platfrom_finish: if (suspend_ops->finish) -- cgit v1.2.3 From d161630297a20802d01c55847bfcba85d2118a9f Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Sun, 24 May 2009 22:05:42 +0200 Subject: PM core: rename suspend and resume functions This patch (as1241) renames a bunch of functions in the PM core. Rather than go through a boring list of name changes, suffice it to say that in the end we have a bunch of pairs of functions: device_resume_noirq dpm_resume_noirq device_resume dpm_resume device_complete dpm_complete device_suspend_noirq dpm_suspend_noirq device_suspend dpm_suspend device_prepare dpm_prepare in which device_X does the X operation on a single device and dpm_X invokes device_X for all devices in the dpm_list. In addition, the old dpm_power_up and device_resume_noirq have been combined into a single function (dpm_resume_noirq). Lastly, dpm_suspend_start and dpm_resume_end are the renamed versions of the former top-level device_suspend and device_resume routines. Signed-off-by: Alan Stern Acked-by: Magnus Damm Signed-off-by: Rafael J. Wysocki --- kernel/kexec.c | 14 +++++++------- kernel/power/disk.c | 30 +++++++++++++++--------------- kernel/power/main.c | 8 ++++---- 3 files changed, 26 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 5a3da87adae..ae1c35201cc 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1448,17 +1448,17 @@ int kernel_kexec(void) goto Restore_console; } suspend_console(); - error = device_suspend(PMSG_FREEZE); + error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Resume_console; - /* At this point, device_suspend() has been called, - * but *not* device_suspend_noirq(). We *must* call - * device_suspend_noirq() now. Otherwise, drivers for + /* At this point, dpm_suspend_start() has been called, + * but *not* dpm_suspend_noirq(). We *must* call + * dpm_suspend_noirq() now. Otherwise, drivers for * some devices (e.g. interrupt controllers) become * desynchronized with the actual state of the * hardware at resume time, and evil weirdness ensues. */ - error = device_suspend_noirq(PMSG_FREEZE); + error = dpm_suspend_noirq(PMSG_FREEZE); if (error) goto Resume_devices; error = disable_nonboot_cpus(); @@ -1486,9 +1486,9 @@ int kernel_kexec(void) local_irq_enable(); Enable_cpus: enable_nonboot_cpus(); - device_resume_noirq(PMSG_RESTORE); + dpm_resume_noirq(PMSG_RESTORE); Resume_devices: - device_resume(PMSG_RESTORE); + dpm_resume_end(PMSG_RESTORE); Resume_console: resume_console(); thaw_processes(); diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 1c18bc894a2..a9beba68b6c 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -215,13 +215,13 @@ static int create_image(int platform_mode) if (error) return error; - /* At this point, device_suspend() has been called, but *not* - * device_suspend_noirq(). We *must* call device_suspend_noirq() now. + /* At this point, dpm_suspend_start() has been called, but *not* + * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now. * Otherwise, drivers for some devices (e.g. interrupt controllers) * become desynchronized with the actual state of the hardware * at resume time, and evil weirdness ensues. */ - error = device_suspend_noirq(PMSG_FREEZE); + error = dpm_suspend_noirq(PMSG_FREEZE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); @@ -262,7 +262,7 @@ static int create_image(int platform_mode) Power_up: sysdev_resume(); - /* NOTE: device_resume_noirq() is just a resume() for devices + /* NOTE: dpm_resume_noirq() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ @@ -275,7 +275,7 @@ static int create_image(int platform_mode) Platform_finish: platform_finish(platform_mode); - device_resume_noirq(in_suspend ? + dpm_resume_noirq(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); return error; @@ -304,7 +304,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); - error = device_suspend(PMSG_FREEZE); + error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Recover_platform; @@ -315,7 +315,7 @@ int hibernation_snapshot(int platform_mode) /* Control returns here after successful restore */ Resume_devices: - device_resume(in_suspend ? + dpm_resume_end(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); resume_console(); Close: @@ -339,7 +339,7 @@ static int resume_target_kernel(bool platform_mode) { int error; - error = device_suspend_noirq(PMSG_QUIESCE); + error = dpm_suspend_noirq(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -394,7 +394,7 @@ static int resume_target_kernel(bool platform_mode) Cleanup: platform_restore_cleanup(platform_mode); - device_resume_noirq(PMSG_RECOVER); + dpm_resume_noirq(PMSG_RECOVER); return error; } @@ -414,10 +414,10 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); - error = device_suspend(PMSG_QUIESCE); + error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); - device_resume(PMSG_RECOVER); + dpm_resume_end(PMSG_RECOVER); } resume_console(); pm_restore_console(); @@ -447,14 +447,14 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); - error = device_suspend(PMSG_HIBERNATE); + error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) hibernation_ops->recover(); goto Resume_devices; } - error = device_suspend_noirq(PMSG_HIBERNATE); + error = dpm_suspend_noirq(PMSG_HIBERNATE); if (error) goto Resume_devices; @@ -479,11 +479,11 @@ int hibernation_platform_enter(void) Platofrm_finish: hibernation_ops->finish(); - device_suspend_noirq(PMSG_RESTORE); + dpm_suspend_noirq(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; - device_resume(PMSG_RESTORE); + dpm_resume_end(PMSG_RESTORE); resume_console(); Close: diff --git a/kernel/power/main.c b/kernel/power/main.c index 2f6638ee03c..46386b9f8dd 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -295,7 +295,7 @@ static int suspend_enter(suspend_state_t state) return error; } - error = device_suspend_noirq(PMSG_SUSPEND); + error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); goto Platfrom_finish; @@ -335,7 +335,7 @@ static int suspend_enter(suspend_state_t state) suspend_ops->wake(); Power_up_devices: - device_resume_noirq(PMSG_RESUME); + dpm_resume_noirq(PMSG_RESUME); Platfrom_finish: if (suspend_ops->finish) @@ -363,7 +363,7 @@ int suspend_devices_and_enter(suspend_state_t state) } suspend_console(); suspend_test_start(); - error = device_suspend(PMSG_SUSPEND); + error = dpm_suspend_start(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to suspend\n"); goto Recover_platform; @@ -376,7 +376,7 @@ int suspend_devices_and_enter(suspend_state_t state) Resume_devices: suspend_test_start(); - device_resume(PMSG_RESUME); + dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); resume_console(); Close: -- cgit v1.2.3 From c6f37f12197ac3bd2e5a35f2f0e195ae63d437de Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 24 May 2009 22:16:31 +0200 Subject: PM/Suspend: Do not shrink memory before suspend Remove the shrinking of memory from the suspend-to-RAM code, where it is not really necessary. Signed-off-by: Rafael J. Wysocki Acked-by: Nigel Cunningham Acked-by: Wu Fengguang --- kernel/power/main.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 46386b9f8dd..2a19f347bd8 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -188,9 +188,6 @@ static void suspend_test_finish(const char *label) #endif -/* This is just an arbitrary number */ -#define FREE_PAGE_NUMBER (100) - static struct platform_suspend_ops *suspend_ops; /** @@ -226,7 +223,6 @@ int suspend_valid_only_mem(suspend_state_t state) static int suspend_prepare(void) { int error; - unsigned int free_pages; if (!suspend_ops || !suspend_ops->enter) return -EPERM; @@ -241,24 +237,10 @@ static int suspend_prepare(void) if (error) goto Finish; - if (suspend_freeze_processes()) { - error = -EAGAIN; - goto Thaw; - } - - free_pages = global_page_state(NR_FREE_PAGES); - if (free_pages < FREE_PAGE_NUMBER) { - pr_debug("PM: free some memory\n"); - shrink_all_memory(FREE_PAGE_NUMBER - free_pages); - if (nr_free_pages() < FREE_PAGE_NUMBER) { - error = -ENOMEM; - printk(KERN_ERR "PM: No enough memory\n"); - } - } + error = suspend_freeze_processes(); if (!error) return 0; - Thaw: suspend_thaw_processes(); usermodehelper_enable(); Finish: -- cgit v1.2.3 From fe419535d82724314bbf1244a0e740e4ea1bd3ae Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 11 Jun 2009 23:11:17 +0200 Subject: PM/Hibernate: Move memory shrinking to snapshot.c (rev. 2) A future patch is going to modify the memory shrinking code so that it will make memory allocations to free memory instead of using an artificial memory shrinking mechanism for that. For this purpose it is convenient to move swsusp_shrink_memory() from kernel/power/swsusp.c to kernel/power/snapshot.c, because the new memory-shrinking code is going to use things that are local to kernel/power/snapshot.c . [rev. 2: Make some functions static and remove their headers from kernel/power/power.h] Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Acked-by: Wu Fengguang --- kernel/power/power.h | 4 +-- kernel/power/snapshot.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++-- kernel/power/swsusp.c | 76 ---------------------------------------------- 3 files changed, 79 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 46b5ec7a3af..ec4dbdfb07b 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void); extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); -extern unsigned int count_data_pages(void); +extern int swsusp_shrink_memory(void); /** * Auxiliary structure used for reading the snapshot image data and @@ -149,7 +149,6 @@ extern int swsusp_swap_in_use(void); /* kernel/power/disk.c */ extern int swsusp_check(void); -extern int swsusp_shrink_memory(void); extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); @@ -176,7 +175,6 @@ extern int pm_notifier_call_chain(unsigned long val); #endif #ifdef CONFIG_HIGHMEM -unsigned int count_highmem_pages(void); int restore_highmem(void); #else static inline unsigned int count_highmem_pages(void) { return 0; } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 33e2e4a819f..523a451b45d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -39,6 +39,14 @@ static int swsusp_page_is_free(struct page *); static void swsusp_set_page_forbidden(struct page *); static void swsusp_unset_page_forbidden(struct page *); +/* + * Preferred image size in bytes (tunable via /sys/power/image_size). + * When it is set to N, swsusp will do its best to ensure the image + * size will not exceed N bytes, but if that is impossible, it will + * try to create the smallest image possible. + */ +unsigned long image_size = 500 * 1024 * 1024; + /* List of PBEs needed for restoring the pages that were allocated before * the suspend and included in the suspend image, but have also been * allocated by the "resume" kernel, so their contents cannot be written @@ -840,7 +848,7 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) * pages. */ -unsigned int count_highmem_pages(void) +static unsigned int count_highmem_pages(void) { struct zone *zone; unsigned int n = 0; @@ -902,7 +910,7 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) * pages. */ -unsigned int count_data_pages(void) +static unsigned int count_data_pages(void) { struct zone *zone; unsigned long pfn, max_zone_pfn; @@ -1058,6 +1066,74 @@ void swsusp_free(void) buffer = NULL; } +/** + * swsusp_shrink_memory - Try to free as much memory as needed + * + * ... but do not OOM-kill anyone + * + * Notice: all userland should be stopped before it is called, or + * livelock is possible. + */ + +#define SHRINK_BITE 10000 +static inline unsigned long __shrink_memory(long tmp) +{ + if (tmp > SHRINK_BITE) + tmp = SHRINK_BITE; + return shrink_all_memory(tmp); +} + +int swsusp_shrink_memory(void) +{ + long tmp; + struct zone *zone; + unsigned long pages = 0; + unsigned int i = 0; + char *p = "-\\|/"; + struct timeval start, stop; + + printk(KERN_INFO "PM: Shrinking memory... "); + do_gettimeofday(&start); + do { + long size, highmem_size; + + highmem_size = count_highmem_pages(); + size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; + tmp = size; + size += highmem_size; + for_each_populated_zone(zone) { + tmp += snapshot_additional_pages(zone); + if (is_highmem(zone)) { + highmem_size -= + zone_page_state(zone, NR_FREE_PAGES); + } else { + tmp -= zone_page_state(zone, NR_FREE_PAGES); + tmp += zone->lowmem_reserve[ZONE_NORMAL]; + } + } + + if (highmem_size < 0) + highmem_size = 0; + + tmp += highmem_size; + if (tmp > 0) { + tmp = __shrink_memory(tmp); + if (!tmp) + return -ENOMEM; + pages += tmp; + } else if (size > image_size / PAGE_SIZE) { + tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); + pages += tmp; + } + printk("\b%c", p[i++%4]); + } while (tmp > 0); + do_gettimeofday(&stop); + printk("\bdone (%lu pages freed)\n", pages); + swsusp_show_speed(&start, &stop, pages, "Freed"); + + return 0; +} + #ifdef CONFIG_HIGHMEM /** * count_pages_for_highmem - compute the number of non-highmem pages diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 78c35047586..87b901cb392 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -55,14 +55,6 @@ #include "power.h" -/* - * Preferred image size in bytes (tunable via /sys/power/image_size). - * When it is set to N, swsusp will do its best to ensure the image - * size will not exceed N bytes, but if that is impossible, it will - * try to create the smallest image possible. - */ -unsigned long image_size = 500 * 1024 * 1024; - int in_suspend __nosavedata = 0; /** @@ -195,74 +187,6 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop, kps / 1000, (kps % 1000) / 10); } -/** - * swsusp_shrink_memory - Try to free as much memory as needed - * - * ... but do not OOM-kill anyone - * - * Notice: all userland should be stopped before it is called, or - * livelock is possible. - */ - -#define SHRINK_BITE 10000 -static inline unsigned long __shrink_memory(long tmp) -{ - if (tmp > SHRINK_BITE) - tmp = SHRINK_BITE; - return shrink_all_memory(tmp); -} - -int swsusp_shrink_memory(void) -{ - long tmp; - struct zone *zone; - unsigned long pages = 0; - unsigned int i = 0; - char *p = "-\\|/"; - struct timeval start, stop; - - printk(KERN_INFO "PM: Shrinking memory... "); - do_gettimeofday(&start); - do { - long size, highmem_size; - - highmem_size = count_highmem_pages(); - size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; - tmp = size; - size += highmem_size; - for_each_populated_zone(zone) { - tmp += snapshot_additional_pages(zone); - if (is_highmem(zone)) { - highmem_size -= - zone_page_state(zone, NR_FREE_PAGES); - } else { - tmp -= zone_page_state(zone, NR_FREE_PAGES); - tmp += zone->lowmem_reserve[ZONE_NORMAL]; - } - } - - if (highmem_size < 0) - highmem_size = 0; - - tmp += highmem_size; - if (tmp > 0) { - tmp = __shrink_memory(tmp); - if (!tmp) - return -ENOMEM; - pages += tmp; - } else if (size > image_size / PAGE_SIZE) { - tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); - pages += tmp; - } - printk("\b%c", p[i++%4]); - } while (tmp > 0); - do_gettimeofday(&stop); - printk("\bdone (%lu pages freed)\n", pages); - swsusp_show_speed(&start, &stop, pages, "Freed"); - - return 0; -} - /* * Platforms, like ACPI, may want us to save some memory used by them during * hibernation and to restore the contents of this memory during the subsequent -- cgit v1.2.3 From a9d7052363a6e06bb623ed1876c56c7ca5b2c6d8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 10 Jun 2009 01:27:12 +0200 Subject: PM: Separate suspend to RAM functionality from core Move the suspend to RAM and standby code from kernel/power/main.c to two separate files, kernel/power/suspend.c containing the basic functions and kernel/power/suspend_test.c containing the automatic suspend test facility based on the RTC clock alarm. There are no changes in functionality related to these modifications. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek --- kernel/power/Makefile | 2 + kernel/power/main.c | 503 -------------------------------------------- kernel/power/power.h | 17 +- kernel/power/suspend.c | 300 ++++++++++++++++++++++++++ kernel/power/suspend_test.c | 187 ++++++++++++++++ 5 files changed, 505 insertions(+), 504 deletions(-) create mode 100644 kernel/power/suspend.c create mode 100644 kernel/power/suspend_test.c (limited to 'kernel') diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 720ea4f781b..c4baf1b633c 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -6,6 +6,8 @@ endif obj-$(CONFIG_PM) += main.o obj-$(CONFIG_PM_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o +obj-$(CONFIG_SUSPEND) += suspend.o +obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/main.c b/kernel/power/main.c index 2a19f347bd8..f710e36930c 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -8,20 +8,9 @@ * */ -#include -#include #include #include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include #include "power.h" @@ -119,355 +108,6 @@ power_attr(pm_test); #endif /* CONFIG_PM_SLEEP */ -#ifdef CONFIG_SUSPEND - -static int suspend_test(int level) -{ -#ifdef CONFIG_PM_DEBUG - if (pm_test_level == level) { - printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); - mdelay(5000); - return 1; - } -#endif /* !CONFIG_PM_DEBUG */ - return 0; -} - -#ifdef CONFIG_PM_TEST_SUSPEND - -/* - * We test the system suspend code by setting an RTC wakealarm a short - * time in the future, then suspending. Suspending the devices won't - * normally take long ... some systems only need a few milliseconds. - * - * The time it takes is system-specific though, so when we test this - * during system bootup we allow a LOT of time. - */ -#define TEST_SUSPEND_SECONDS 5 - -static unsigned long suspend_test_start_time; - -static void suspend_test_start(void) -{ - /* FIXME Use better timebase than "jiffies", ideally a clocksource. - * What we want is a hardware counter that will work correctly even - * during the irqs-are-off stages of the suspend/resume cycle... - */ - suspend_test_start_time = jiffies; -} - -static void suspend_test_finish(const char *label) -{ - long nj = jiffies - suspend_test_start_time; - unsigned msec; - - msec = jiffies_to_msecs(abs(nj)); - pr_info("PM: %s took %d.%03d seconds\n", label, - msec / 1000, msec % 1000); - - /* Warning on suspend means the RTC alarm period needs to be - * larger -- the system was sooo slooowwww to suspend that the - * alarm (should have) fired before the system went to sleep! - * - * Warning on either suspend or resume also means the system - * has some performance issues. The stack dump of a WARN_ON - * is more likely to get the right attention than a printk... - */ - WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label); -} - -#else - -static void suspend_test_start(void) -{ -} - -static void suspend_test_finish(const char *label) -{ -} - -#endif - -static struct platform_suspend_ops *suspend_ops; - -/** - * suspend_set_ops - Set the global suspend method table. - * @ops: Pointer to ops structure. - */ - -void suspend_set_ops(struct platform_suspend_ops *ops) -{ - mutex_lock(&pm_mutex); - suspend_ops = ops; - mutex_unlock(&pm_mutex); -} - -/** - * suspend_valid_only_mem - generic memory-only valid callback - * - * Platform drivers that implement mem suspend only and only need - * to check for that in their .valid callback can use this instead - * of rolling their own .valid callback. - */ -int suspend_valid_only_mem(suspend_state_t state) -{ - return state == PM_SUSPEND_MEM; -} - -/** - * suspend_prepare - Do prep work before entering low-power state. - * - * This is common code that is called for each state that we're entering. - * Run suspend notifiers, allocate a console and stop all processes. - */ -static int suspend_prepare(void) -{ - int error; - - if (!suspend_ops || !suspend_ops->enter) - return -EPERM; - - pm_prepare_console(); - - error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); - if (error) - goto Finish; - - error = usermodehelper_disable(); - if (error) - goto Finish; - - error = suspend_freeze_processes(); - if (!error) - return 0; - - suspend_thaw_processes(); - usermodehelper_enable(); - Finish: - pm_notifier_call_chain(PM_POST_SUSPEND); - pm_restore_console(); - return error; -} - -/* default implementation */ -void __attribute__ ((weak)) arch_suspend_disable_irqs(void) -{ - local_irq_disable(); -} - -/* default implementation */ -void __attribute__ ((weak)) arch_suspend_enable_irqs(void) -{ - local_irq_enable(); -} - -/** - * suspend_enter - enter the desired system sleep state. - * @state: state to enter - * - * This function should be called after devices have been suspended. - */ -static int suspend_enter(suspend_state_t state) -{ - int error; - - if (suspend_ops->prepare) { - error = suspend_ops->prepare(); - if (error) - return error; - } - - error = dpm_suspend_noirq(PMSG_SUSPEND); - if (error) { - printk(KERN_ERR "PM: Some devices failed to power down\n"); - goto Platfrom_finish; - } - - if (suspend_ops->prepare_late) { - error = suspend_ops->prepare_late(); - if (error) - goto Power_up_devices; - } - - if (suspend_test(TEST_PLATFORM)) - goto Platform_wake; - - error = disable_nonboot_cpus(); - if (error || suspend_test(TEST_CPUS)) - goto Enable_cpus; - - arch_suspend_disable_irqs(); - BUG_ON(!irqs_disabled()); - - error = sysdev_suspend(PMSG_SUSPEND); - if (!error) { - if (!suspend_test(TEST_CORE)) - error = suspend_ops->enter(state); - sysdev_resume(); - } - - arch_suspend_enable_irqs(); - BUG_ON(irqs_disabled()); - - Enable_cpus: - enable_nonboot_cpus(); - - Platform_wake: - if (suspend_ops->wake) - suspend_ops->wake(); - - Power_up_devices: - dpm_resume_noirq(PMSG_RESUME); - - Platfrom_finish: - if (suspend_ops->finish) - suspend_ops->finish(); - - return error; -} - -/** - * suspend_devices_and_enter - suspend devices and enter the desired system - * sleep state. - * @state: state to enter - */ -int suspend_devices_and_enter(suspend_state_t state) -{ - int error; - - if (!suspend_ops) - return -ENOSYS; - - if (suspend_ops->begin) { - error = suspend_ops->begin(state); - if (error) - goto Close; - } - suspend_console(); - suspend_test_start(); - error = dpm_suspend_start(PMSG_SUSPEND); - if (error) { - printk(KERN_ERR "PM: Some devices failed to suspend\n"); - goto Recover_platform; - } - suspend_test_finish("suspend devices"); - if (suspend_test(TEST_DEVICES)) - goto Recover_platform; - - suspend_enter(state); - - Resume_devices: - suspend_test_start(); - dpm_resume_end(PMSG_RESUME); - suspend_test_finish("resume devices"); - resume_console(); - Close: - if (suspend_ops->end) - suspend_ops->end(); - return error; - - Recover_platform: - if (suspend_ops->recover) - suspend_ops->recover(); - goto Resume_devices; -} - -/** - * suspend_finish - Do final work before exiting suspend sequence. - * - * Call platform code to clean up, restart processes, and free the - * console that we've allocated. This is not called for suspend-to-disk. - */ -static void suspend_finish(void) -{ - suspend_thaw_processes(); - usermodehelper_enable(); - pm_notifier_call_chain(PM_POST_SUSPEND); - pm_restore_console(); -} - - - - -static const char * const pm_states[PM_SUSPEND_MAX] = { - [PM_SUSPEND_STANDBY] = "standby", - [PM_SUSPEND_MEM] = "mem", -}; - -static inline int valid_state(suspend_state_t state) -{ - /* All states need lowlevel support and need to be valid - * to the lowlevel implementation, no valid callback - * implies that none are valid. */ - if (!suspend_ops || !suspend_ops->valid || !suspend_ops->valid(state)) - return 0; - return 1; -} - - -/** - * enter_state - Do common work of entering low-power state. - * @state: pm_state structure for state we're entering. - * - * Make sure we're the only ones trying to enter a sleep state. Fail - * if someone has beat us to it, since we don't want anything weird to - * happen when we wake up. - * Then, do the setup for suspend, enter the state, and cleaup (after - * we've woken up). - */ -static int enter_state(suspend_state_t state) -{ - int error; - - if (!valid_state(state)) - return -ENODEV; - - if (!mutex_trylock(&pm_mutex)) - return -EBUSY; - - printk(KERN_INFO "PM: Syncing filesystems ... "); - sys_sync(); - printk("done.\n"); - - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); - error = suspend_prepare(); - if (error) - goto Unlock; - - if (suspend_test(TEST_FREEZER)) - goto Finish; - - pr_debug("PM: Entering %s sleep\n", pm_states[state]); - error = suspend_devices_and_enter(state); - - Finish: - pr_debug("PM: Finishing wakeup.\n"); - suspend_finish(); - Unlock: - mutex_unlock(&pm_mutex); - return error; -} - - -/** - * pm_suspend - Externally visible function for suspending system. - * @state: Enumerated value of state to enter. - * - * Determine whether or not value is within range, get state - * structure, and enter (above). - */ - -int pm_suspend(suspend_state_t state) -{ - if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) - return enter_state(state); - return -EINVAL; -} - -EXPORT_SYMBOL(pm_suspend); - -#endif /* CONFIG_SUSPEND */ - struct kobject *power_kobj; /** @@ -480,7 +120,6 @@ struct kobject *power_kobj; * store() accepts one of those strings, translates it into the * proper enumerated value, and initiates a suspend transition. */ - static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -578,7 +217,6 @@ static struct attribute_group attr_group = { .attrs = g, }; - static int __init pm_init(void) { power_kobj = kobject_create_and_add("power", NULL); @@ -588,144 +226,3 @@ static int __init pm_init(void) } core_initcall(pm_init); - - -#ifdef CONFIG_PM_TEST_SUSPEND - -#include - -/* - * To test system suspend, we need a hands-off mechanism to resume the - * system. RTCs wake alarms are a common self-contained mechanism. - */ - -static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) -{ - static char err_readtime[] __initdata = - KERN_ERR "PM: can't read %s time, err %d\n"; - static char err_wakealarm [] __initdata = - KERN_ERR "PM: can't set %s wakealarm, err %d\n"; - static char err_suspend[] __initdata = - KERN_ERR "PM: suspend test failed, error %d\n"; - static char info_test[] __initdata = - KERN_INFO "PM: test RTC wakeup from '%s' suspend\n"; - - unsigned long now; - struct rtc_wkalrm alm; - int status; - - /* this may fail if the RTC hasn't been initialized */ - status = rtc_read_time(rtc, &alm.time); - if (status < 0) { - printk(err_readtime, dev_name(&rtc->dev), status); - return; - } - rtc_tm_to_time(&alm.time, &now); - - memset(&alm, 0, sizeof alm); - rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); - alm.enabled = true; - - status = rtc_set_alarm(rtc, &alm); - if (status < 0) { - printk(err_wakealarm, dev_name(&rtc->dev), status); - return; - } - - if (state == PM_SUSPEND_MEM) { - printk(info_test, pm_states[state]); - status = pm_suspend(state); - if (status == -ENODEV) - state = PM_SUSPEND_STANDBY; - } - if (state == PM_SUSPEND_STANDBY) { - printk(info_test, pm_states[state]); - status = pm_suspend(state); - } - if (status < 0) - printk(err_suspend, status); - - /* Some platforms can't detect that the alarm triggered the - * wakeup, or (accordingly) disable it after it afterwards. - * It's supposed to give oneshot behavior; cope. - */ - alm.enabled = false; - rtc_set_alarm(rtc, &alm); -} - -static int __init has_wakealarm(struct device *dev, void *name_ptr) -{ - struct rtc_device *candidate = to_rtc_device(dev); - - if (!candidate->ops->set_alarm) - return 0; - if (!device_may_wakeup(candidate->dev.parent)) - return 0; - - *(const char **)name_ptr = dev_name(dev); - return 1; -} - -/* - * Kernel options like "test_suspend=mem" force suspend/resume sanity tests - * at startup time. They're normally disabled, for faster boot and because - * we can't know which states really work on this particular system. - */ -static suspend_state_t test_state __initdata = PM_SUSPEND_ON; - -static char warn_bad_state[] __initdata = - KERN_WARNING "PM: can't test '%s' suspend state\n"; - -static int __init setup_test_suspend(char *value) -{ - unsigned i; - - /* "=mem" ==> "mem" */ - value++; - for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (!pm_states[i]) - continue; - if (strcmp(pm_states[i], value) != 0) - continue; - test_state = (__force suspend_state_t) i; - return 0; - } - printk(warn_bad_state, value); - return 0; -} -__setup("test_suspend", setup_test_suspend); - -static int __init test_suspend(void) -{ - static char warn_no_rtc[] __initdata = - KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; - - char *pony = NULL; - struct rtc_device *rtc = NULL; - - /* PM is initialized by now; is that state testable? */ - if (test_state == PM_SUSPEND_ON) - goto done; - if (!valid_state(test_state)) { - printk(warn_bad_state, pm_states[test_state]); - goto done; - } - - /* RTCs have initialized by now too ... can we use one? */ - class_find_device(rtc_class, NULL, &pony, has_wakealarm); - if (pony) - rtc = rtc_class_open(pony); - if (!rtc) { - printk(warn_no_rtc); - goto done; - } - - /* go for it */ - test_wakealarm(rtc, test_state); - rtc_class_close(rtc); -done: - return 0; -} -late_initcall(test_suspend); - -#endif /* CONFIG_PM_TEST_SUSPEND */ diff --git a/kernel/power/power.h b/kernel/power/power.h index ec4dbdfb07b..2bd98d9fc19 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -160,15 +160,30 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, unsigned int, char *); #ifdef CONFIG_SUSPEND -/* kernel/power/main.c */ +/* kernel/power/suspend.c */ +extern const char *const pm_states[]; + +extern bool valid_state(suspend_state_t state); extern int suspend_devices_and_enter(suspend_state_t state); +extern int enter_state(suspend_state_t state); #else /* !CONFIG_SUSPEND */ static inline int suspend_devices_and_enter(suspend_state_t state) { return -ENOSYS; } +static inline int enter_state(suspend_state_t state) { return -ENOSYS; } +static inline bool valid_state(suspend_state_t state) { return false; } #endif /* !CONFIG_SUSPEND */ +#ifdef CONFIG_PM_TEST_SUSPEND +/* kernel/power/suspend_test.c */ +extern void suspend_test_start(void); +extern void suspend_test_finish(const char *label); +#else /* !CONFIG_PM_TEST_SUSPEND */ +static inline void suspend_test_start(void) {} +static inline void suspend_test_finish(const char *label) {} +#endif /* !CONFIG_PM_TEST_SUSPEND */ + #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ extern int pm_notifier_call_chain(unsigned long val); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c new file mode 100644 index 00000000000..6f10dfc2d3e --- /dev/null +++ b/kernel/power/suspend.c @@ -0,0 +1,300 @@ +/* + * kernel/power/suspend.c - Suspend to RAM and standby functionality. + * + * Copyright (c) 2003 Patrick Mochel + * Copyright (c) 2003 Open Source Development Lab + * Copyright (c) 2009 Rafael J. Wysocki , Novell Inc. + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "power.h" + +const char *const pm_states[PM_SUSPEND_MAX] = { + [PM_SUSPEND_STANDBY] = "standby", + [PM_SUSPEND_MEM] = "mem", +}; + +static struct platform_suspend_ops *suspend_ops; + +/** + * suspend_set_ops - Set the global suspend method table. + * @ops: Pointer to ops structure. + */ +void suspend_set_ops(struct platform_suspend_ops *ops) +{ + mutex_lock(&pm_mutex); + suspend_ops = ops; + mutex_unlock(&pm_mutex); +} + +bool valid_state(suspend_state_t state) +{ + /* + * All states need lowlevel support and need to be valid to the lowlevel + * implementation, no valid callback implies that none are valid. + */ + return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); +} + +/** + * suspend_valid_only_mem - generic memory-only valid callback + * + * Platform drivers that implement mem suspend only and only need + * to check for that in their .valid callback can use this instead + * of rolling their own .valid callback. + */ +int suspend_valid_only_mem(suspend_state_t state) +{ + return state == PM_SUSPEND_MEM; +} + +static int suspend_test(int level) +{ +#ifdef CONFIG_PM_DEBUG + if (pm_test_level == level) { + printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); + mdelay(5000); + return 1; + } +#endif /* !CONFIG_PM_DEBUG */ + return 0; +} + +/** + * suspend_prepare - Do prep work before entering low-power state. + * + * This is common code that is called for each state that we're entering. + * Run suspend notifiers, allocate a console and stop all processes. + */ +static int suspend_prepare(void) +{ + int error; + + if (!suspend_ops || !suspend_ops->enter) + return -EPERM; + + pm_prepare_console(); + + error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); + if (error) + goto Finish; + + error = usermodehelper_disable(); + if (error) + goto Finish; + + error = suspend_freeze_processes(); + if (!error) + return 0; + + suspend_thaw_processes(); + usermodehelper_enable(); + Finish: + pm_notifier_call_chain(PM_POST_SUSPEND); + pm_restore_console(); + return error; +} + +/* default implementation */ +void __attribute__ ((weak)) arch_suspend_disable_irqs(void) +{ + local_irq_disable(); +} + +/* default implementation */ +void __attribute__ ((weak)) arch_suspend_enable_irqs(void) +{ + local_irq_enable(); +} + +/** + * suspend_enter - enter the desired system sleep state. + * @state: state to enter + * + * This function should be called after devices have been suspended. + */ +static int suspend_enter(suspend_state_t state) +{ + int error; + + if (suspend_ops->prepare) { + error = suspend_ops->prepare(); + if (error) + return error; + } + + error = dpm_suspend_noirq(PMSG_SUSPEND); + if (error) { + printk(KERN_ERR "PM: Some devices failed to power down\n"); + goto Platfrom_finish; + } + + if (suspend_ops->prepare_late) { + error = suspend_ops->prepare_late(); + if (error) + goto Power_up_devices; + } + + if (suspend_test(TEST_PLATFORM)) + goto Platform_wake; + + error = disable_nonboot_cpus(); + if (error || suspend_test(TEST_CPUS)) + goto Enable_cpus; + + arch_suspend_disable_irqs(); + BUG_ON(!irqs_disabled()); + + error = sysdev_suspend(PMSG_SUSPEND); + if (!error) { + if (!suspend_test(TEST_CORE)) + error = suspend_ops->enter(state); + sysdev_resume(); + } + + arch_suspend_enable_irqs(); + BUG_ON(irqs_disabled()); + + Enable_cpus: + enable_nonboot_cpus(); + + Platform_wake: + if (suspend_ops->wake) + suspend_ops->wake(); + + Power_up_devices: + dpm_resume_noirq(PMSG_RESUME); + + Platfrom_finish: + if (suspend_ops->finish) + suspend_ops->finish(); + + return error; +} + +/** + * suspend_devices_and_enter - suspend devices and enter the desired system + * sleep state. + * @state: state to enter + */ +int suspend_devices_and_enter(suspend_state_t state) +{ + int error; + + if (!suspend_ops) + return -ENOSYS; + + if (suspend_ops->begin) { + error = suspend_ops->begin(state); + if (error) + goto Close; + } + suspend_console(); + suspend_test_start(); + error = dpm_suspend_start(PMSG_SUSPEND); + if (error) { + printk(KERN_ERR "PM: Some devices failed to suspend\n"); + goto Recover_platform; + } + suspend_test_finish("suspend devices"); + if (suspend_test(TEST_DEVICES)) + goto Recover_platform; + + suspend_enter(state); + + Resume_devices: + suspend_test_start(); + dpm_resume_end(PMSG_RESUME); + suspend_test_finish("resume devices"); + resume_console(); + Close: + if (suspend_ops->end) + suspend_ops->end(); + return error; + + Recover_platform: + if (suspend_ops->recover) + suspend_ops->recover(); + goto Resume_devices; +} + +/** + * suspend_finish - Do final work before exiting suspend sequence. + * + * Call platform code to clean up, restart processes, and free the + * console that we've allocated. This is not called for suspend-to-disk. + */ +static void suspend_finish(void) +{ + suspend_thaw_processes(); + usermodehelper_enable(); + pm_notifier_call_chain(PM_POST_SUSPEND); + pm_restore_console(); +} + +/** + * enter_state - Do common work of entering low-power state. + * @state: pm_state structure for state we're entering. + * + * Make sure we're the only ones trying to enter a sleep state. Fail + * if someone has beat us to it, since we don't want anything weird to + * happen when we wake up. + * Then, do the setup for suspend, enter the state, and cleaup (after + * we've woken up). + */ +int enter_state(suspend_state_t state) +{ + int error; + + if (!valid_state(state)) + return -ENODEV; + + if (!mutex_trylock(&pm_mutex)) + return -EBUSY; + + printk(KERN_INFO "PM: Syncing filesystems ... "); + sys_sync(); + printk("done.\n"); + + pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); + error = suspend_prepare(); + if (error) + goto Unlock; + + if (suspend_test(TEST_FREEZER)) + goto Finish; + + pr_debug("PM: Entering %s sleep\n", pm_states[state]); + error = suspend_devices_and_enter(state); + + Finish: + pr_debug("PM: Finishing wakeup.\n"); + suspend_finish(); + Unlock: + mutex_unlock(&pm_mutex); + return error; +} + +/** + * pm_suspend - Externally visible function for suspending system. + * @state: Enumerated value of state to enter. + * + * Determine whether or not value is within range, get state + * structure, and enter (above). + */ +int pm_suspend(suspend_state_t state) +{ + if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) + return enter_state(state); + return -EINVAL; +} +EXPORT_SYMBOL(pm_suspend); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c new file mode 100644 index 00000000000..17d8bb1acf9 --- /dev/null +++ b/kernel/power/suspend_test.c @@ -0,0 +1,187 @@ +/* + * kernel/power/suspend_test.c - Suspend to RAM and standby test facility. + * + * Copyright (c) 2009 Pavel Machek + * + * This file is released under the GPLv2. + */ + +#include +#include + +#include "power.h" + +/* + * We test the system suspend code by setting an RTC wakealarm a short + * time in the future, then suspending. Suspending the devices won't + * normally take long ... some systems only need a few milliseconds. + * + * The time it takes is system-specific though, so when we test this + * during system bootup we allow a LOT of time. + */ +#define TEST_SUSPEND_SECONDS 5 + +static unsigned long suspend_test_start_time; + +void suspend_test_start(void) +{ + /* FIXME Use better timebase than "jiffies", ideally a clocksource. + * What we want is a hardware counter that will work correctly even + * during the irqs-are-off stages of the suspend/resume cycle... + */ + suspend_test_start_time = jiffies; +} + +void suspend_test_finish(const char *label) +{ + long nj = jiffies - suspend_test_start_time; + unsigned msec; + + msec = jiffies_to_msecs(abs(nj)); + pr_info("PM: %s took %d.%03d seconds\n", label, + msec / 1000, msec % 1000); + + /* Warning on suspend means the RTC alarm period needs to be + * larger -- the system was sooo slooowwww to suspend that the + * alarm (should have) fired before the system went to sleep! + * + * Warning on either suspend or resume also means the system + * has some performance issues. The stack dump of a WARN_ON + * is more likely to get the right attention than a printk... + */ + WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label); +} + +/* + * To test system suspend, we need a hands-off mechanism to resume the + * system. RTCs wake alarms are a common self-contained mechanism. + */ + +static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) +{ + static char err_readtime[] __initdata = + KERN_ERR "PM: can't read %s time, err %d\n"; + static char err_wakealarm [] __initdata = + KERN_ERR "PM: can't set %s wakealarm, err %d\n"; + static char err_suspend[] __initdata = + KERN_ERR "PM: suspend test failed, error %d\n"; + static char info_test[] __initdata = + KERN_INFO "PM: test RTC wakeup from '%s' suspend\n"; + + unsigned long now; + struct rtc_wkalrm alm; + int status; + + /* this may fail if the RTC hasn't been initialized */ + status = rtc_read_time(rtc, &alm.time); + if (status < 0) { + printk(err_readtime, dev_name(&rtc->dev), status); + return; + } + rtc_tm_to_time(&alm.time, &now); + + memset(&alm, 0, sizeof alm); + rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); + alm.enabled = true; + + status = rtc_set_alarm(rtc, &alm); + if (status < 0) { + printk(err_wakealarm, dev_name(&rtc->dev), status); + return; + } + + if (state == PM_SUSPEND_MEM) { + printk(info_test, pm_states[state]); + status = pm_suspend(state); + if (status == -ENODEV) + state = PM_SUSPEND_STANDBY; + } + if (state == PM_SUSPEND_STANDBY) { + printk(info_test, pm_states[state]); + status = pm_suspend(state); + } + if (status < 0) + printk(err_suspend, status); + + /* Some platforms can't detect that the alarm triggered the + * wakeup, or (accordingly) disable it after it afterwards. + * It's supposed to give oneshot behavior; cope. + */ + alm.enabled = false; + rtc_set_alarm(rtc, &alm); +} + +static int __init has_wakealarm(struct device *dev, void *name_ptr) +{ + struct rtc_device *candidate = to_rtc_device(dev); + + if (!candidate->ops->set_alarm) + return 0; + if (!device_may_wakeup(candidate->dev.parent)) + return 0; + + *(const char **)name_ptr = dev_name(dev); + return 1; +} + +/* + * Kernel options like "test_suspend=mem" force suspend/resume sanity tests + * at startup time. They're normally disabled, for faster boot and because + * we can't know which states really work on this particular system. + */ +static suspend_state_t test_state __initdata = PM_SUSPEND_ON; + +static char warn_bad_state[] __initdata = + KERN_WARNING "PM: can't test '%s' suspend state\n"; + +static int __init setup_test_suspend(char *value) +{ + unsigned i; + + /* "=mem" ==> "mem" */ + value++; + for (i = 0; i < PM_SUSPEND_MAX; i++) { + if (!pm_states[i]) + continue; + if (strcmp(pm_states[i], value) != 0) + continue; + test_state = (__force suspend_state_t) i; + return 0; + } + printk(warn_bad_state, value); + return 0; +} +__setup("test_suspend", setup_test_suspend); + +static int __init test_suspend(void) +{ + static char warn_no_rtc[] __initdata = + KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; + + char *pony = NULL; + struct rtc_device *rtc = NULL; + + /* PM is initialized by now; is that state testable? */ + if (test_state == PM_SUSPEND_ON) + goto done; + if (!valid_state(test_state)) { + printk(warn_bad_state, pm_states[test_state]); + goto done; + } + + /* RTCs have initialized by now too ... can we use one? */ + class_find_device(rtc_class, NULL, &pony, has_wakealarm); + if (pony) + rtc = rtc_class_open(pony); + if (!rtc) { + printk(warn_no_rtc); + goto done; + } + + /* go for it */ + test_wakealarm(rtc, test_state); + rtc_class_close(rtc); +done: + return 0; +} +late_initcall(test_suspend); -- cgit v1.2.3 From 8b759b84c8b3c27ccc8dd787294636297b3ebb40 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 10 Jun 2009 01:27:49 +0200 Subject: PM/Hibernate: Rename disk.c to hibernate.c Change the name of kernel/power/disk.c to kernel/power/hibernate.c in analogy with the file names introduced by the changes that separated the suspend to RAM and standby funtionality from the common PM functions. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek --- kernel/power/Makefile | 2 +- kernel/power/disk.c | 955 ----------------------------------------------- kernel/power/hibernate.c | 955 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/power/power.h | 4 +- 4 files changed, 958 insertions(+), 958 deletions(-) delete mode 100644 kernel/power/disk.c create mode 100644 kernel/power/hibernate.c (limited to 'kernel') diff --git a/kernel/power/Makefile b/kernel/power/Makefile index c4baf1b633c..eadb17fc8f5 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -8,6 +8,6 @@ obj-$(CONFIG_PM_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o -obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o +obj-$(CONFIG_HIBERNATION) += swsusp.o hibernate.o snapshot.o swap.o user.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/disk.c b/kernel/power/disk.c deleted file mode 100644 index a9beba68b6c..00000000000 --- a/kernel/power/disk.c +++ /dev/null @@ -1,955 +0,0 @@ -/* - * kernel/power/disk.c - Suspend-to-disk support. - * - * Copyright (c) 2003 Patrick Mochel - * Copyright (c) 2003 Open Source Development Lab - * Copyright (c) 2004 Pavel Machek - * - * This file is released under the GPLv2. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "power.h" - - -static int noresume = 0; -static char resume_file[256] = CONFIG_PM_STD_PARTITION; -dev_t swsusp_resume_device; -sector_t swsusp_resume_block; - -enum { - HIBERNATION_INVALID, - HIBERNATION_PLATFORM, - HIBERNATION_TEST, - HIBERNATION_TESTPROC, - HIBERNATION_SHUTDOWN, - HIBERNATION_REBOOT, - /* keep last */ - __HIBERNATION_AFTER_LAST -}; -#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1) -#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1) - -static int hibernation_mode = HIBERNATION_SHUTDOWN; - -static struct platform_hibernation_ops *hibernation_ops; - -/** - * hibernation_set_ops - set the global hibernate operations - * @ops: the hibernation operations to use in subsequent hibernation transitions - */ - -void hibernation_set_ops(struct platform_hibernation_ops *ops) -{ - if (ops && !(ops->begin && ops->end && ops->pre_snapshot - && ops->prepare && ops->finish && ops->enter && ops->pre_restore - && ops->restore_cleanup)) { - WARN_ON(1); - return; - } - mutex_lock(&pm_mutex); - hibernation_ops = ops; - if (ops) - hibernation_mode = HIBERNATION_PLATFORM; - else if (hibernation_mode == HIBERNATION_PLATFORM) - hibernation_mode = HIBERNATION_SHUTDOWN; - - mutex_unlock(&pm_mutex); -} - -static bool entering_platform_hibernation; - -bool system_entering_hibernation(void) -{ - return entering_platform_hibernation; -} -EXPORT_SYMBOL(system_entering_hibernation); - -#ifdef CONFIG_PM_DEBUG -static void hibernation_debug_sleep(void) -{ - printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n"); - mdelay(5000); -} - -static int hibernation_testmode(int mode) -{ - if (hibernation_mode == mode) { - hibernation_debug_sleep(); - return 1; - } - return 0; -} - -static int hibernation_test(int level) -{ - if (pm_test_level == level) { - hibernation_debug_sleep(); - return 1; - } - return 0; -} -#else /* !CONFIG_PM_DEBUG */ -static int hibernation_testmode(int mode) { return 0; } -static int hibernation_test(int level) { return 0; } -#endif /* !CONFIG_PM_DEBUG */ - -/** - * platform_begin - tell the platform driver that we're starting - * hibernation - */ - -static int platform_begin(int platform_mode) -{ - return (platform_mode && hibernation_ops) ? - hibernation_ops->begin() : 0; -} - -/** - * platform_end - tell the platform driver that we've entered the - * working state - */ - -static void platform_end(int platform_mode) -{ - if (platform_mode && hibernation_ops) - hibernation_ops->end(); -} - -/** - * platform_pre_snapshot - prepare the machine for hibernation using the - * platform driver if so configured and return an error code if it fails - */ - -static int platform_pre_snapshot(int platform_mode) -{ - return (platform_mode && hibernation_ops) ? - hibernation_ops->pre_snapshot() : 0; -} - -/** - * platform_leave - prepare the machine for switching to the normal mode - * of operation using the platform driver (called with interrupts disabled) - */ - -static void platform_leave(int platform_mode) -{ - if (platform_mode && hibernation_ops) - hibernation_ops->leave(); -} - -/** - * platform_finish - switch the machine to the normal mode of operation - * using the platform driver (must be called after platform_prepare()) - */ - -static void platform_finish(int platform_mode) -{ - if (platform_mode && hibernation_ops) - hibernation_ops->finish(); -} - -/** - * platform_pre_restore - prepare the platform for the restoration from a - * hibernation image. If the restore fails after this function has been - * called, platform_restore_cleanup() must be called. - */ - -static int platform_pre_restore(int platform_mode) -{ - return (platform_mode && hibernation_ops) ? - hibernation_ops->pre_restore() : 0; -} - -/** - * platform_restore_cleanup - switch the platform to the normal mode of - * operation after a failing restore. If platform_pre_restore() has been - * called before the failing restore, this function must be called too, - * regardless of the result of platform_pre_restore(). - */ - -static void platform_restore_cleanup(int platform_mode) -{ - if (platform_mode && hibernation_ops) - hibernation_ops->restore_cleanup(); -} - -/** - * platform_recover - recover the platform from a failure to suspend - * devices. - */ - -static void platform_recover(int platform_mode) -{ - if (platform_mode && hibernation_ops && hibernation_ops->recover) - hibernation_ops->recover(); -} - -/** - * create_image - freeze devices that need to be frozen with interrupts - * off, create the hibernation image and thaw those devices. Control - * reappears in this routine after a restore. - */ - -static int create_image(int platform_mode) -{ - int error; - - error = arch_prepare_suspend(); - if (error) - return error; - - /* At this point, dpm_suspend_start() has been called, but *not* - * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now. - * Otherwise, drivers for some devices (e.g. interrupt controllers) - * become desynchronized with the actual state of the hardware - * at resume time, and evil weirdness ensues. - */ - error = dpm_suspend_noirq(PMSG_FREEZE); - if (error) { - printk(KERN_ERR "PM: Some devices failed to power down, " - "aborting hibernation\n"); - return error; - } - - error = platform_pre_snapshot(platform_mode); - if (error || hibernation_test(TEST_PLATFORM)) - goto Platform_finish; - - error = disable_nonboot_cpus(); - if (error || hibernation_test(TEST_CPUS) - || hibernation_testmode(HIBERNATION_TEST)) - goto Enable_cpus; - - local_irq_disable(); - - error = sysdev_suspend(PMSG_FREEZE); - if (error) { - printk(KERN_ERR "PM: Some system devices failed to power down, " - "aborting hibernation\n"); - goto Enable_irqs; - } - - if (hibernation_test(TEST_CORE)) - goto Power_up; - - in_suspend = 1; - save_processor_state(); - error = swsusp_arch_suspend(); - if (error) - printk(KERN_ERR "PM: Error %d creating hibernation image\n", - error); - /* Restore control flow magically appears here */ - restore_processor_state(); - if (!in_suspend) - platform_leave(platform_mode); - - Power_up: - sysdev_resume(); - /* NOTE: dpm_resume_noirq() is just a resume() for devices - * that suspended with irqs off ... no overall powerup. - */ - - Enable_irqs: - local_irq_enable(); - - Enable_cpus: - enable_nonboot_cpus(); - - Platform_finish: - platform_finish(platform_mode); - - dpm_resume_noirq(in_suspend ? - (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - - return error; -} - -/** - * hibernation_snapshot - quiesce devices and create the hibernation - * snapshot image. - * @platform_mode - if set, use the platform driver, if available, to - * prepare the platform firmware for the power transition. - * - * Must be called with pm_mutex held - */ - -int hibernation_snapshot(int platform_mode) -{ - int error; - - error = platform_begin(platform_mode); - if (error) - return error; - - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); - if (error) - goto Close; - - suspend_console(); - error = dpm_suspend_start(PMSG_FREEZE); - if (error) - goto Recover_platform; - - if (hibernation_test(TEST_DEVICES)) - goto Recover_platform; - - error = create_image(platform_mode); - /* Control returns here after successful restore */ - - Resume_devices: - dpm_resume_end(in_suspend ? - (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - resume_console(); - Close: - platform_end(platform_mode); - return error; - - Recover_platform: - platform_recover(platform_mode); - goto Resume_devices; -} - -/** - * resume_target_kernel - prepare devices that need to be suspended with - * interrupts off, restore the contents of highmem that have not been - * restored yet from the image and run the low level code that will restore - * the remaining contents of memory and switch to the just restored target - * kernel. - */ - -static int resume_target_kernel(bool platform_mode) -{ - int error; - - error = dpm_suspend_noirq(PMSG_QUIESCE); - if (error) { - printk(KERN_ERR "PM: Some devices failed to power down, " - "aborting resume\n"); - return error; - } - - error = platform_pre_restore(platform_mode); - if (error) - goto Cleanup; - - error = disable_nonboot_cpus(); - if (error) - goto Enable_cpus; - - local_irq_disable(); - - error = sysdev_suspend(PMSG_QUIESCE); - if (error) - goto Enable_irqs; - - /* We'll ignore saved state, but this gets preempt count (etc) right */ - save_processor_state(); - error = restore_highmem(); - if (!error) { - error = swsusp_arch_resume(); - /* - * The code below is only ever reached in case of a failure. - * Otherwise execution continues at place where - * swsusp_arch_suspend() was called - */ - BUG_ON(!error); - /* This call to restore_highmem() undos the previous one */ - restore_highmem(); - } - /* - * The only reason why swsusp_arch_resume() can fail is memory being - * very tight, so we have to free it as soon as we can to avoid - * subsequent failures - */ - swsusp_free(); - restore_processor_state(); - touch_softlockup_watchdog(); - - sysdev_resume(); - - Enable_irqs: - local_irq_enable(); - - Enable_cpus: - enable_nonboot_cpus(); - - Cleanup: - platform_restore_cleanup(platform_mode); - - dpm_resume_noirq(PMSG_RECOVER); - - return error; -} - -/** - * hibernation_restore - quiesce devices and restore the hibernation - * snapshot image. If successful, control returns in hibernation_snaphot() - * @platform_mode - if set, use the platform driver, if available, to - * prepare the platform firmware for the transition. - * - * Must be called with pm_mutex held - */ - -int hibernation_restore(int platform_mode) -{ - int error; - - pm_prepare_console(); - suspend_console(); - error = dpm_suspend_start(PMSG_QUIESCE); - if (!error) { - error = resume_target_kernel(platform_mode); - dpm_resume_end(PMSG_RECOVER); - } - resume_console(); - pm_restore_console(); - return error; -} - -/** - * hibernation_platform_enter - enter the hibernation state using the - * platform driver (if available) - */ - -int hibernation_platform_enter(void) -{ - int error; - - if (!hibernation_ops) - return -ENOSYS; - - /* - * We have cancelled the power transition by running - * hibernation_ops->finish() before saving the image, so we should let - * the firmware know that we're going to enter the sleep state after all - */ - error = hibernation_ops->begin(); - if (error) - goto Close; - - entering_platform_hibernation = true; - suspend_console(); - error = dpm_suspend_start(PMSG_HIBERNATE); - if (error) { - if (hibernation_ops->recover) - hibernation_ops->recover(); - goto Resume_devices; - } - - error = dpm_suspend_noirq(PMSG_HIBERNATE); - if (error) - goto Resume_devices; - - error = hibernation_ops->prepare(); - if (error) - goto Platofrm_finish; - - error = disable_nonboot_cpus(); - if (error) - goto Platofrm_finish; - - local_irq_disable(); - sysdev_suspend(PMSG_HIBERNATE); - hibernation_ops->enter(); - /* We should never get here */ - while (1); - - /* - * We don't need to reenable the nonboot CPUs or resume consoles, since - * the system is going to be halted anyway. - */ - Platofrm_finish: - hibernation_ops->finish(); - - dpm_suspend_noirq(PMSG_RESTORE); - - Resume_devices: - entering_platform_hibernation = false; - dpm_resume_end(PMSG_RESTORE); - resume_console(); - - Close: - hibernation_ops->end(); - - return error; -} - -/** - * power_down - Shut the machine down for hibernation. - * - * Use the platform driver, if configured so; otherwise try - * to power off or reboot. - */ - -static void power_down(void) -{ - switch (hibernation_mode) { - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: - break; - case HIBERNATION_REBOOT: - kernel_restart(NULL); - break; - case HIBERNATION_PLATFORM: - hibernation_platform_enter(); - case HIBERNATION_SHUTDOWN: - kernel_power_off(); - break; - } - kernel_halt(); - /* - * Valid image is on the disk, if we continue we risk serious data - * corruption after resume. - */ - printk(KERN_CRIT "PM: Please power down manually\n"); - while(1); -} - -static int prepare_processes(void) -{ - int error = 0; - - if (freeze_processes()) { - error = -EBUSY; - thaw_processes(); - } - return error; -} - -/** - * hibernate - The granpappy of the built-in hibernation management - */ - -int hibernate(void) -{ - int error; - - mutex_lock(&pm_mutex); - /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { - error = -EBUSY; - goto Unlock; - } - - pm_prepare_console(); - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); - if (error) - goto Exit; - - error = usermodehelper_disable(); - if (error) - goto Exit; - - /* Allocate memory management structures */ - error = create_basic_memory_bitmaps(); - if (error) - goto Exit; - - printk(KERN_INFO "PM: Syncing filesystems ... "); - sys_sync(); - printk("done.\n"); - - error = prepare_processes(); - if (error) - goto Finish; - - if (hibernation_test(TEST_FREEZER)) - goto Thaw; - - if (hibernation_testmode(HIBERNATION_TESTPROC)) - goto Thaw; - - error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); - if (in_suspend && !error) { - unsigned int flags = 0; - - if (hibernation_mode == HIBERNATION_PLATFORM) - flags |= SF_PLATFORM_MODE; - pr_debug("PM: writing image.\n"); - error = swsusp_write(flags); - swsusp_free(); - if (!error) - power_down(); - } else { - pr_debug("PM: Image restored successfully.\n"); - swsusp_free(); - } - Thaw: - thaw_processes(); - Finish: - free_basic_memory_bitmaps(); - usermodehelper_enable(); - Exit: - pm_notifier_call_chain(PM_POST_HIBERNATION); - pm_restore_console(); - atomic_inc(&snapshot_device_available); - Unlock: - mutex_unlock(&pm_mutex); - return error; -} - - -/** - * software_resume - Resume from a saved image. - * - * Called as a late_initcall (so all devices are discovered and - * initialized), we call swsusp to see if we have a saved image or not. - * If so, we quiesce devices, the restore the saved image. We will - * return above (in hibernate() ) if everything goes well. - * Otherwise, we fail gracefully and return to the normally - * scheduled program. - * - */ - -static int software_resume(void) -{ - int error; - unsigned int flags; - - /* - * If the user said "noresume".. bail out early. - */ - if (noresume) - return 0; - - /* - * name_to_dev_t() below takes a sysfs buffer mutex when sysfs - * is configured into the kernel. Since the regular hibernate - * trigger path is via sysfs which takes a buffer mutex before - * calling hibernate functions (which take pm_mutex) this can - * cause lockdep to complain about a possible ABBA deadlock - * which cannot happen since we're in the boot code here and - * sysfs can't be invoked yet. Therefore, we use a subclass - * here to avoid lockdep complaining. - */ - mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING); - - if (swsusp_resume_device) - goto Check_image; - - if (!strlen(resume_file)) { - error = -ENOENT; - goto Unlock; - } - - pr_debug("PM: Checking image partition %s\n", resume_file); - - /* Check if the device is there */ - swsusp_resume_device = name_to_dev_t(resume_file); - if (!swsusp_resume_device) { - /* - * Some device discovery might still be in progress; we need - * to wait for this to finish. - */ - wait_for_device_probe(); - /* - * We can't depend on SCSI devices being available after loading - * one of their modules until scsi_complete_async_scans() is - * called and the resume device usually is a SCSI one. - */ - scsi_complete_async_scans(); - - swsusp_resume_device = name_to_dev_t(resume_file); - if (!swsusp_resume_device) { - error = -ENODEV; - goto Unlock; - } - } - - Check_image: - pr_debug("PM: Resume from partition %d:%d\n", - MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); - - pr_debug("PM: Checking hibernation image.\n"); - error = swsusp_check(); - if (error) - goto Unlock; - - /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { - error = -EBUSY; - goto Unlock; - } - - pm_prepare_console(); - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); - if (error) - goto Finish; - - error = usermodehelper_disable(); - if (error) - goto Finish; - - error = create_basic_memory_bitmaps(); - if (error) - goto Finish; - - pr_debug("PM: Preparing processes for restore.\n"); - error = prepare_processes(); - if (error) { - swsusp_close(FMODE_READ); - goto Done; - } - - pr_debug("PM: Reading hibernation image.\n"); - - error = swsusp_read(&flags); - if (!error) - hibernation_restore(flags & SF_PLATFORM_MODE); - - printk(KERN_ERR "PM: Restore failed, recovering.\n"); - swsusp_free(); - thaw_processes(); - Done: - free_basic_memory_bitmaps(); - usermodehelper_enable(); - Finish: - pm_notifier_call_chain(PM_POST_RESTORE); - pm_restore_console(); - atomic_inc(&snapshot_device_available); - /* For success case, the suspend path will release the lock */ - Unlock: - mutex_unlock(&pm_mutex); - pr_debug("PM: Resume from disk failed.\n"); - return error; -} - -late_initcall(software_resume); - - -static const char * const hibernation_modes[] = { - [HIBERNATION_PLATFORM] = "platform", - [HIBERNATION_SHUTDOWN] = "shutdown", - [HIBERNATION_REBOOT] = "reboot", - [HIBERNATION_TEST] = "test", - [HIBERNATION_TESTPROC] = "testproc", -}; - -/** - * disk - Control hibernation mode - * - * Suspend-to-disk can be handled in several ways. We have a few options - * for putting the system to sleep - using the platform driver (e.g. ACPI - * or other hibernation_ops), powering off the system or rebooting the - * system (for testing) as well as the two test modes. - * - * The system can support 'platform', and that is known a priori (and - * encoded by the presence of hibernation_ops). However, the user may - * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the - * test modes, 'test' or 'testproc'. - * - * show() will display what the mode is currently set to. - * store() will accept one of - * - * 'platform' - * 'shutdown' - * 'reboot' - * 'test' - * 'testproc' - * - * It will only change to 'platform' if the system - * supports it (as determined by having hibernation_ops). - */ - -static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - int i; - char *start = buf; - - for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { - if (!hibernation_modes[i]) - continue; - switch (i) { - case HIBERNATION_SHUTDOWN: - case HIBERNATION_REBOOT: - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: - break; - case HIBERNATION_PLATFORM: - if (hibernation_ops) - break; - /* not a valid mode, continue with loop */ - continue; - } - if (i == hibernation_mode) - buf += sprintf(buf, "[%s] ", hibernation_modes[i]); - else - buf += sprintf(buf, "%s ", hibernation_modes[i]); - } - buf += sprintf(buf, "\n"); - return buf-start; -} - - -static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - int error = 0; - int i; - int len; - char *p; - int mode = HIBERNATION_INVALID; - - p = memchr(buf, '\n', n); - len = p ? p - buf : n; - - mutex_lock(&pm_mutex); - for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { - if (len == strlen(hibernation_modes[i]) - && !strncmp(buf, hibernation_modes[i], len)) { - mode = i; - break; - } - } - if (mode != HIBERNATION_INVALID) { - switch (mode) { - case HIBERNATION_SHUTDOWN: - case HIBERNATION_REBOOT: - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: - hibernation_mode = mode; - break; - case HIBERNATION_PLATFORM: - if (hibernation_ops) - hibernation_mode = mode; - else - error = -EINVAL; - } - } else - error = -EINVAL; - - if (!error) - pr_debug("PM: Hibernation mode set to '%s'\n", - hibernation_modes[mode]); - mutex_unlock(&pm_mutex); - return error ? error : n; -} - -power_attr(disk); - -static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), - MINOR(swsusp_resume_device)); -} - -static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - unsigned int maj, min; - dev_t res; - int ret = -EINVAL; - - if (sscanf(buf, "%u:%u", &maj, &min) != 2) - goto out; - - res = MKDEV(maj,min); - if (maj != MAJOR(res) || min != MINOR(res)) - goto out; - - mutex_lock(&pm_mutex); - swsusp_resume_device = res; - mutex_unlock(&pm_mutex); - printk(KERN_INFO "PM: Starting manual resume from disk\n"); - noresume = 0; - software_resume(); - ret = n; - out: - return ret; -} - -power_attr(resume); - -static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%lu\n", image_size); -} - -static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - unsigned long size; - - if (sscanf(buf, "%lu", &size) == 1) { - image_size = size; - return n; - } - - return -EINVAL; -} - -power_attr(image_size); - -static struct attribute * g[] = { - &disk_attr.attr, - &resume_attr.attr, - &image_size_attr.attr, - NULL, -}; - - -static struct attribute_group attr_group = { - .attrs = g, -}; - - -static int __init pm_disk_init(void) -{ - return sysfs_create_group(power_kobj, &attr_group); -} - -core_initcall(pm_disk_init); - - -static int __init resume_setup(char *str) -{ - if (noresume) - return 1; - - strncpy( resume_file, str, 255 ); - return 1; -} - -static int __init resume_offset_setup(char *str) -{ - unsigned long long offset; - - if (noresume) - return 1; - - if (sscanf(str, "%llu", &offset) == 1) - swsusp_resume_block = offset; - - return 1; -} - -static int __init noresume_setup(char *str) -{ - noresume = 1; - return 1; -} - -__setup("noresume", noresume_setup); -__setup("resume_offset=", resume_offset_setup); -__setup("resume=", resume_setup); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c new file mode 100644 index 00000000000..81d2e746489 --- /dev/null +++ b/kernel/power/hibernate.c @@ -0,0 +1,955 @@ +/* + * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support. + * + * Copyright (c) 2003 Patrick Mochel + * Copyright (c) 2003 Open Source Development Lab + * Copyright (c) 2004 Pavel Machek + * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "power.h" + + +static int noresume = 0; +static char resume_file[256] = CONFIG_PM_STD_PARTITION; +dev_t swsusp_resume_device; +sector_t swsusp_resume_block; + +enum { + HIBERNATION_INVALID, + HIBERNATION_PLATFORM, + HIBERNATION_TEST, + HIBERNATION_TESTPROC, + HIBERNATION_SHUTDOWN, + HIBERNATION_REBOOT, + /* keep last */ + __HIBERNATION_AFTER_LAST +}; +#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1) +#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1) + +static int hibernation_mode = HIBERNATION_SHUTDOWN; + +static struct platform_hibernation_ops *hibernation_ops; + +/** + * hibernation_set_ops - set the global hibernate operations + * @ops: the hibernation operations to use in subsequent hibernation transitions + */ + +void hibernation_set_ops(struct platform_hibernation_ops *ops) +{ + if (ops && !(ops->begin && ops->end && ops->pre_snapshot + && ops->prepare && ops->finish && ops->enter && ops->pre_restore + && ops->restore_cleanup)) { + WARN_ON(1); + return; + } + mutex_lock(&pm_mutex); + hibernation_ops = ops; + if (ops) + hibernation_mode = HIBERNATION_PLATFORM; + else if (hibernation_mode == HIBERNATION_PLATFORM) + hibernation_mode = HIBERNATION_SHUTDOWN; + + mutex_unlock(&pm_mutex); +} + +static bool entering_platform_hibernation; + +bool system_entering_hibernation(void) +{ + return entering_platform_hibernation; +} +EXPORT_SYMBOL(system_entering_hibernation); + +#ifdef CONFIG_PM_DEBUG +static void hibernation_debug_sleep(void) +{ + printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n"); + mdelay(5000); +} + +static int hibernation_testmode(int mode) +{ + if (hibernation_mode == mode) { + hibernation_debug_sleep(); + return 1; + } + return 0; +} + +static int hibernation_test(int level) +{ + if (pm_test_level == level) { + hibernation_debug_sleep(); + return 1; + } + return 0; +} +#else /* !CONFIG_PM_DEBUG */ +static int hibernation_testmode(int mode) { return 0; } +static int hibernation_test(int level) { return 0; } +#endif /* !CONFIG_PM_DEBUG */ + +/** + * platform_begin - tell the platform driver that we're starting + * hibernation + */ + +static int platform_begin(int platform_mode) +{ + return (platform_mode && hibernation_ops) ? + hibernation_ops->begin() : 0; +} + +/** + * platform_end - tell the platform driver that we've entered the + * working state + */ + +static void platform_end(int platform_mode) +{ + if (platform_mode && hibernation_ops) + hibernation_ops->end(); +} + +/** + * platform_pre_snapshot - prepare the machine for hibernation using the + * platform driver if so configured and return an error code if it fails + */ + +static int platform_pre_snapshot(int platform_mode) +{ + return (platform_mode && hibernation_ops) ? + hibernation_ops->pre_snapshot() : 0; +} + +/** + * platform_leave - prepare the machine for switching to the normal mode + * of operation using the platform driver (called with interrupts disabled) + */ + +static void platform_leave(int platform_mode) +{ + if (platform_mode && hibernation_ops) + hibernation_ops->leave(); +} + +/** + * platform_finish - switch the machine to the normal mode of operation + * using the platform driver (must be called after platform_prepare()) + */ + +static void platform_finish(int platform_mode) +{ + if (platform_mode && hibernation_ops) + hibernation_ops->finish(); +} + +/** + * platform_pre_restore - prepare the platform for the restoration from a + * hibernation image. If the restore fails after this function has been + * called, platform_restore_cleanup() must be called. + */ + +static int platform_pre_restore(int platform_mode) +{ + return (platform_mode && hibernation_ops) ? + hibernation_ops->pre_restore() : 0; +} + +/** + * platform_restore_cleanup - switch the platform to the normal mode of + * operation after a failing restore. If platform_pre_restore() has been + * called before the failing restore, this function must be called too, + * regardless of the result of platform_pre_restore(). + */ + +static void platform_restore_cleanup(int platform_mode) +{ + if (platform_mode && hibernation_ops) + hibernation_ops->restore_cleanup(); +} + +/** + * platform_recover - recover the platform from a failure to suspend + * devices. + */ + +static void platform_recover(int platform_mode) +{ + if (platform_mode && hibernation_ops && hibernation_ops->recover) + hibernation_ops->recover(); +} + +/** + * create_image - freeze devices that need to be frozen with interrupts + * off, create the hibernation image and thaw those devices. Control + * reappears in this routine after a restore. + */ + +static int create_image(int platform_mode) +{ + int error; + + error = arch_prepare_suspend(); + if (error) + return error; + + /* At this point, dpm_suspend_start() has been called, but *not* + * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now. + * Otherwise, drivers for some devices (e.g. interrupt controllers) + * become desynchronized with the actual state of the hardware + * at resume time, and evil weirdness ensues. + */ + error = dpm_suspend_noirq(PMSG_FREEZE); + if (error) { + printk(KERN_ERR "PM: Some devices failed to power down, " + "aborting hibernation\n"); + return error; + } + + error = platform_pre_snapshot(platform_mode); + if (error || hibernation_test(TEST_PLATFORM)) + goto Platform_finish; + + error = disable_nonboot_cpus(); + if (error || hibernation_test(TEST_CPUS) + || hibernation_testmode(HIBERNATION_TEST)) + goto Enable_cpus; + + local_irq_disable(); + + error = sysdev_suspend(PMSG_FREEZE); + if (error) { + printk(KERN_ERR "PM: Some system devices failed to power down, " + "aborting hibernation\n"); + goto Enable_irqs; + } + + if (hibernation_test(TEST_CORE)) + goto Power_up; + + in_suspend = 1; + save_processor_state(); + error = swsusp_arch_suspend(); + if (error) + printk(KERN_ERR "PM: Error %d creating hibernation image\n", + error); + /* Restore control flow magically appears here */ + restore_processor_state(); + if (!in_suspend) + platform_leave(platform_mode); + + Power_up: + sysdev_resume(); + /* NOTE: dpm_resume_noirq() is just a resume() for devices + * that suspended with irqs off ... no overall powerup. + */ + + Enable_irqs: + local_irq_enable(); + + Enable_cpus: + enable_nonboot_cpus(); + + Platform_finish: + platform_finish(platform_mode); + + dpm_resume_noirq(in_suspend ? + (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); + + return error; +} + +/** + * hibernation_snapshot - quiesce devices and create the hibernation + * snapshot image. + * @platform_mode - if set, use the platform driver, if available, to + * prepare the platform firmware for the power transition. + * + * Must be called with pm_mutex held + */ + +int hibernation_snapshot(int platform_mode) +{ + int error; + + error = platform_begin(platform_mode); + if (error) + return error; + + /* Free memory before shutting down devices. */ + error = swsusp_shrink_memory(); + if (error) + goto Close; + + suspend_console(); + error = dpm_suspend_start(PMSG_FREEZE); + if (error) + goto Recover_platform; + + if (hibernation_test(TEST_DEVICES)) + goto Recover_platform; + + error = create_image(platform_mode); + /* Control returns here after successful restore */ + + Resume_devices: + dpm_resume_end(in_suspend ? + (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); + resume_console(); + Close: + platform_end(platform_mode); + return error; + + Recover_platform: + platform_recover(platform_mode); + goto Resume_devices; +} + +/** + * resume_target_kernel - prepare devices that need to be suspended with + * interrupts off, restore the contents of highmem that have not been + * restored yet from the image and run the low level code that will restore + * the remaining contents of memory and switch to the just restored target + * kernel. + */ + +static int resume_target_kernel(bool platform_mode) +{ + int error; + + error = dpm_suspend_noirq(PMSG_QUIESCE); + if (error) { + printk(KERN_ERR "PM: Some devices failed to power down, " + "aborting resume\n"); + return error; + } + + error = platform_pre_restore(platform_mode); + if (error) + goto Cleanup; + + error = disable_nonboot_cpus(); + if (error) + goto Enable_cpus; + + local_irq_disable(); + + error = sysdev_suspend(PMSG_QUIESCE); + if (error) + goto Enable_irqs; + + /* We'll ignore saved state, but this gets preempt count (etc) right */ + save_processor_state(); + error = restore_highmem(); + if (!error) { + error = swsusp_arch_resume(); + /* + * The code below is only ever reached in case of a failure. + * Otherwise execution continues at place where + * swsusp_arch_suspend() was called + */ + BUG_ON(!error); + /* This call to restore_highmem() undos the previous one */ + restore_highmem(); + } + /* + * The only reason why swsusp_arch_resume() can fail is memory being + * very tight, so we have to free it as soon as we can to avoid + * subsequent failures + */ + swsusp_free(); + restore_processor_state(); + touch_softlockup_watchdog(); + + sysdev_resume(); + + Enable_irqs: + local_irq_enable(); + + Enable_cpus: + enable_nonboot_cpus(); + + Cleanup: + platform_restore_cleanup(platform_mode); + + dpm_resume_noirq(PMSG_RECOVER); + + return error; +} + +/** + * hibernation_restore - quiesce devices and restore the hibernation + * snapshot image. If successful, control returns in hibernation_snaphot() + * @platform_mode - if set, use the platform driver, if available, to + * prepare the platform firmware for the transition. + * + * Must be called with pm_mutex held + */ + +int hibernation_restore(int platform_mode) +{ + int error; + + pm_prepare_console(); + suspend_console(); + error = dpm_suspend_start(PMSG_QUIESCE); + if (!error) { + error = resume_target_kernel(platform_mode); + dpm_resume_end(PMSG_RECOVER); + } + resume_console(); + pm_restore_console(); + return error; +} + +/** + * hibernation_platform_enter - enter the hibernation state using the + * platform driver (if available) + */ + +int hibernation_platform_enter(void) +{ + int error; + + if (!hibernation_ops) + return -ENOSYS; + + /* + * We have cancelled the power transition by running + * hibernation_ops->finish() before saving the image, so we should let + * the firmware know that we're going to enter the sleep state after all + */ + error = hibernation_ops->begin(); + if (error) + goto Close; + + entering_platform_hibernation = true; + suspend_console(); + error = dpm_suspend_start(PMSG_HIBERNATE); + if (error) { + if (hibernation_ops->recover) + hibernation_ops->recover(); + goto Resume_devices; + } + + error = dpm_suspend_noirq(PMSG_HIBERNATE); + if (error) + goto Resume_devices; + + error = hibernation_ops->prepare(); + if (error) + goto Platofrm_finish; + + error = disable_nonboot_cpus(); + if (error) + goto Platofrm_finish; + + local_irq_disable(); + sysdev_suspend(PMSG_HIBERNATE); + hibernation_ops->enter(); + /* We should never get here */ + while (1); + + /* + * We don't need to reenable the nonboot CPUs or resume consoles, since + * the system is going to be halted anyway. + */ + Platofrm_finish: + hibernation_ops->finish(); + + dpm_suspend_noirq(PMSG_RESTORE); + + Resume_devices: + entering_platform_hibernation = false; + dpm_resume_end(PMSG_RESTORE); + resume_console(); + + Close: + hibernation_ops->end(); + + return error; +} + +/** + * power_down - Shut the machine down for hibernation. + * + * Use the platform driver, if configured so; otherwise try + * to power off or reboot. + */ + +static void power_down(void) +{ + switch (hibernation_mode) { + case HIBERNATION_TEST: + case HIBERNATION_TESTPROC: + break; + case HIBERNATION_REBOOT: + kernel_restart(NULL); + break; + case HIBERNATION_PLATFORM: + hibernation_platform_enter(); + case HIBERNATION_SHUTDOWN: + kernel_power_off(); + break; + } + kernel_halt(); + /* + * Valid image is on the disk, if we continue we risk serious data + * corruption after resume. + */ + printk(KERN_CRIT "PM: Please power down manually\n"); + while(1); +} + +static int prepare_processes(void) +{ + int error = 0; + + if (freeze_processes()) { + error = -EBUSY; + thaw_processes(); + } + return error; +} + +/** + * hibernate - The granpappy of the built-in hibernation management + */ + +int hibernate(void) +{ + int error; + + mutex_lock(&pm_mutex); + /* The snapshot device should not be opened while we're running */ + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + goto Unlock; + } + + pm_prepare_console(); + error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + if (error) + goto Exit; + + error = usermodehelper_disable(); + if (error) + goto Exit; + + /* Allocate memory management structures */ + error = create_basic_memory_bitmaps(); + if (error) + goto Exit; + + printk(KERN_INFO "PM: Syncing filesystems ... "); + sys_sync(); + printk("done.\n"); + + error = prepare_processes(); + if (error) + goto Finish; + + if (hibernation_test(TEST_FREEZER)) + goto Thaw; + + if (hibernation_testmode(HIBERNATION_TESTPROC)) + goto Thaw; + + error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); + if (in_suspend && !error) { + unsigned int flags = 0; + + if (hibernation_mode == HIBERNATION_PLATFORM) + flags |= SF_PLATFORM_MODE; + pr_debug("PM: writing image.\n"); + error = swsusp_write(flags); + swsusp_free(); + if (!error) + power_down(); + } else { + pr_debug("PM: Image restored successfully.\n"); + swsusp_free(); + } + Thaw: + thaw_processes(); + Finish: + free_basic_memory_bitmaps(); + usermodehelper_enable(); + Exit: + pm_notifier_call_chain(PM_POST_HIBERNATION); + pm_restore_console(); + atomic_inc(&snapshot_device_available); + Unlock: + mutex_unlock(&pm_mutex); + return error; +} + + +/** + * software_resume - Resume from a saved image. + * + * Called as a late_initcall (so all devices are discovered and + * initialized), we call swsusp to see if we have a saved image or not. + * If so, we quiesce devices, the restore the saved image. We will + * return above (in hibernate() ) if everything goes well. + * Otherwise, we fail gracefully and return to the normally + * scheduled program. + * + */ + +static int software_resume(void) +{ + int error; + unsigned int flags; + + /* + * If the user said "noresume".. bail out early. + */ + if (noresume) + return 0; + + /* + * name_to_dev_t() below takes a sysfs buffer mutex when sysfs + * is configured into the kernel. Since the regular hibernate + * trigger path is via sysfs which takes a buffer mutex before + * calling hibernate functions (which take pm_mutex) this can + * cause lockdep to complain about a possible ABBA deadlock + * which cannot happen since we're in the boot code here and + * sysfs can't be invoked yet. Therefore, we use a subclass + * here to avoid lockdep complaining. + */ + mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING); + + if (swsusp_resume_device) + goto Check_image; + + if (!strlen(resume_file)) { + error = -ENOENT; + goto Unlock; + } + + pr_debug("PM: Checking image partition %s\n", resume_file); + + /* Check if the device is there */ + swsusp_resume_device = name_to_dev_t(resume_file); + if (!swsusp_resume_device) { + /* + * Some device discovery might still be in progress; we need + * to wait for this to finish. + */ + wait_for_device_probe(); + /* + * We can't depend on SCSI devices being available after loading + * one of their modules until scsi_complete_async_scans() is + * called and the resume device usually is a SCSI one. + */ + scsi_complete_async_scans(); + + swsusp_resume_device = name_to_dev_t(resume_file); + if (!swsusp_resume_device) { + error = -ENODEV; + goto Unlock; + } + } + + Check_image: + pr_debug("PM: Resume from partition %d:%d\n", + MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); + + pr_debug("PM: Checking hibernation image.\n"); + error = swsusp_check(); + if (error) + goto Unlock; + + /* The snapshot device should not be opened while we're running */ + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + goto Unlock; + } + + pm_prepare_console(); + error = pm_notifier_call_chain(PM_RESTORE_PREPARE); + if (error) + goto Finish; + + error = usermodehelper_disable(); + if (error) + goto Finish; + + error = create_basic_memory_bitmaps(); + if (error) + goto Finish; + + pr_debug("PM: Preparing processes for restore.\n"); + error = prepare_processes(); + if (error) { + swsusp_close(FMODE_READ); + goto Done; + } + + pr_debug("PM: Reading hibernation image.\n"); + + error = swsusp_read(&flags); + if (!error) + hibernation_restore(flags & SF_PLATFORM_MODE); + + printk(KERN_ERR "PM: Restore failed, recovering.\n"); + swsusp_free(); + thaw_processes(); + Done: + free_basic_memory_bitmaps(); + usermodehelper_enable(); + Finish: + pm_notifier_call_chain(PM_POST_RESTORE); + pm_restore_console(); + atomic_inc(&snapshot_device_available); + /* For success case, the suspend path will release the lock */ + Unlock: + mutex_unlock(&pm_mutex); + pr_debug("PM: Resume from disk failed.\n"); + return error; +} + +late_initcall(software_resume); + + +static const char * const hibernation_modes[] = { + [HIBERNATION_PLATFORM] = "platform", + [HIBERNATION_SHUTDOWN] = "shutdown", + [HIBERNATION_REBOOT] = "reboot", + [HIBERNATION_TEST] = "test", + [HIBERNATION_TESTPROC] = "testproc", +}; + +/** + * disk - Control hibernation mode + * + * Suspend-to-disk can be handled in several ways. We have a few options + * for putting the system to sleep - using the platform driver (e.g. ACPI + * or other hibernation_ops), powering off the system or rebooting the + * system (for testing) as well as the two test modes. + * + * The system can support 'platform', and that is known a priori (and + * encoded by the presence of hibernation_ops). However, the user may + * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the + * test modes, 'test' or 'testproc'. + * + * show() will display what the mode is currently set to. + * store() will accept one of + * + * 'platform' + * 'shutdown' + * 'reboot' + * 'test' + * 'testproc' + * + * It will only change to 'platform' if the system + * supports it (as determined by having hibernation_ops). + */ + +static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + int i; + char *start = buf; + + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { + if (!hibernation_modes[i]) + continue; + switch (i) { + case HIBERNATION_SHUTDOWN: + case HIBERNATION_REBOOT: + case HIBERNATION_TEST: + case HIBERNATION_TESTPROC: + break; + case HIBERNATION_PLATFORM: + if (hibernation_ops) + break; + /* not a valid mode, continue with loop */ + continue; + } + if (i == hibernation_mode) + buf += sprintf(buf, "[%s] ", hibernation_modes[i]); + else + buf += sprintf(buf, "%s ", hibernation_modes[i]); + } + buf += sprintf(buf, "\n"); + return buf-start; +} + + +static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int error = 0; + int i; + int len; + char *p; + int mode = HIBERNATION_INVALID; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + + mutex_lock(&pm_mutex); + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { + if (len == strlen(hibernation_modes[i]) + && !strncmp(buf, hibernation_modes[i], len)) { + mode = i; + break; + } + } + if (mode != HIBERNATION_INVALID) { + switch (mode) { + case HIBERNATION_SHUTDOWN: + case HIBERNATION_REBOOT: + case HIBERNATION_TEST: + case HIBERNATION_TESTPROC: + hibernation_mode = mode; + break; + case HIBERNATION_PLATFORM: + if (hibernation_ops) + hibernation_mode = mode; + else + error = -EINVAL; + } + } else + error = -EINVAL; + + if (!error) + pr_debug("PM: Hibernation mode set to '%s'\n", + hibernation_modes[mode]); + mutex_unlock(&pm_mutex); + return error ? error : n; +} + +power_attr(disk); + +static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), + MINOR(swsusp_resume_device)); +} + +static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned int maj, min; + dev_t res; + int ret = -EINVAL; + + if (sscanf(buf, "%u:%u", &maj, &min) != 2) + goto out; + + res = MKDEV(maj,min); + if (maj != MAJOR(res) || min != MINOR(res)) + goto out; + + mutex_lock(&pm_mutex); + swsusp_resume_device = res; + mutex_unlock(&pm_mutex); + printk(KERN_INFO "PM: Starting manual resume from disk\n"); + noresume = 0; + software_resume(); + ret = n; + out: + return ret; +} + +power_attr(resume); + +static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", image_size); +} + +static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long size; + + if (sscanf(buf, "%lu", &size) == 1) { + image_size = size; + return n; + } + + return -EINVAL; +} + +power_attr(image_size); + +static struct attribute * g[] = { + &disk_attr.attr, + &resume_attr.attr, + &image_size_attr.attr, + NULL, +}; + + +static struct attribute_group attr_group = { + .attrs = g, +}; + + +static int __init pm_disk_init(void) +{ + return sysfs_create_group(power_kobj, &attr_group); +} + +core_initcall(pm_disk_init); + + +static int __init resume_setup(char *str) +{ + if (noresume) + return 1; + + strncpy( resume_file, str, 255 ); + return 1; +} + +static int __init resume_offset_setup(char *str) +{ + unsigned long long offset; + + if (noresume) + return 1; + + if (sscanf(str, "%llu", &offset) == 1) + swsusp_resume_block = offset; + + return 1; +} + +static int __init noresume_setup(char *str) +{ + noresume = 1; + return 1; +} + +__setup("noresume", noresume_setup); +__setup("resume_offset=", resume_offset_setup); +__setup("resume=", resume_setup); diff --git a/kernel/power/power.h b/kernel/power/power.h index 2bd98d9fc19..26d5a26f82e 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -45,7 +45,7 @@ static inline char *check_image_kernel(struct swsusp_info *info) */ #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) -/* kernel/power/disk.c */ +/* kernel/power/hibernate.c */ extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); @@ -147,7 +147,7 @@ extern int swsusp_swap_in_use(void); */ #define SF_PLATFORM_MODE 1 -/* kernel/power/disk.c */ +/* kernel/power/hibernate.c */ extern int swsusp_check(void); extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); -- cgit v1.2.3 From fce2b111fae9151a53dabb36513b398d03337a19 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Wed, 10 Jun 2009 01:28:19 +0200 Subject: PM/Hibernate: Move NVS routines into a seperate file (v2). The *_nvs_* routines in swsusp.c make use of the io*map() functions, which are only provided for HAS_IOMEM, thus breaking compilation if HAS_IOMEM is not set. Fix this by moving the *_nvs_* routines into hibernate_nvs.c, which is only compiled if HAS_IOMEM is set. [rjw: Change the name of the new file to hibernate_nvs.c, add the license line to the header comment.] Signed-off-by: Cornelia Huck Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 4 ++ kernel/power/Makefile | 1 + kernel/power/hibernate_nvs.c | 135 +++++++++++++++++++++++++++++++++++++++++++ kernel/power/swsusp.c | 122 -------------------------------------- 4 files changed, 140 insertions(+), 122 deletions(-) create mode 100644 kernel/power/hibernate_nvs.c (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 23bd4daeb96..72067cbdb37 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -116,9 +116,13 @@ config SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. +config HIBERNATION_NVS + bool + config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE + select HIBERNATION_NVS if HAS_IOMEM ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the diff --git a/kernel/power/Makefile b/kernel/power/Makefile index eadb17fc8f5..c3b81c30e5d 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -9,5 +9,6 @@ obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += swsusp.o hibernate.o snapshot.o swap.o user.o +obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c new file mode 100644 index 00000000000..39ac698ef83 --- /dev/null +++ b/kernel/power/hibernate_nvs.c @@ -0,0 +1,135 @@ +/* + * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory + * + * Copyright (C) 2008,2009 Rafael J. Wysocki , Novell Inc. + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include + +/* + * Platforms, like ACPI, may want us to save some memory used by them during + * hibernation and to restore the contents of this memory during the subsequent + * resume. The code below implements a mechanism allowing us to do that. + */ + +struct nvs_page { + unsigned long phys_start; + unsigned int size; + void *kaddr; + void *data; + struct list_head node; +}; + +static LIST_HEAD(nvs_list); + +/** + * hibernate_nvs_register - register platform NVS memory region to save + * @start - physical address of the region + * @size - size of the region + * + * The NVS region need not be page-aligned (both ends) and we arrange + * things so that the data from page-aligned addresses in this region will + * be copied into separate RAM pages. + */ +int hibernate_nvs_register(unsigned long start, unsigned long size) +{ + struct nvs_page *entry, *next; + + while (size > 0) { + unsigned int nr_bytes; + + entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); + if (!entry) + goto Error; + + list_add_tail(&entry->node, &nvs_list); + entry->phys_start = start; + nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); + entry->size = (size < nr_bytes) ? size : nr_bytes; + + start += entry->size; + size -= entry->size; + } + return 0; + + Error: + list_for_each_entry_safe(entry, next, &nvs_list, node) { + list_del(&entry->node); + kfree(entry); + } + return -ENOMEM; +} + +/** + * hibernate_nvs_free - free data pages allocated for saving NVS regions + */ +void hibernate_nvs_free(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + free_page((unsigned long)entry->data); + entry->data = NULL; + if (entry->kaddr) { + iounmap(entry->kaddr); + entry->kaddr = NULL; + } + } +} + +/** + * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions + */ +int hibernate_nvs_alloc(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) { + entry->data = (void *)__get_free_page(GFP_KERNEL); + if (!entry->data) { + hibernate_nvs_free(); + return -ENOMEM; + } + } + return 0; +} + +/** + * hibernate_nvs_save - save NVS memory regions + */ +void hibernate_nvs_save(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Saving platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + entry->kaddr = ioremap(entry->phys_start, entry->size); + memcpy(entry->data, entry->kaddr, entry->size); + } +} + +/** + * hibernate_nvs_restore - restore NVS memory regions + * + * This function is going to be called with interrupts disabled, so it + * cannot iounmap the virtual addresses used to access the NVS region. + */ +void hibernate_nvs_restore(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Restoring platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) + memcpy(entry->kaddr, entry->data, entry->size); +} diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 87b901cb392..6a07f4dbf2f 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -186,125 +186,3 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop, centisecs / 100, centisecs % 100, kps / 1000, (kps % 1000) / 10); } - -/* - * Platforms, like ACPI, may want us to save some memory used by them during - * hibernation and to restore the contents of this memory during the subsequent - * resume. The code below implements a mechanism allowing us to do that. - */ - -struct nvs_page { - unsigned long phys_start; - unsigned int size; - void *kaddr; - void *data; - struct list_head node; -}; - -static LIST_HEAD(nvs_list); - -/** - * hibernate_nvs_register - register platform NVS memory region to save - * @start - physical address of the region - * @size - size of the region - * - * The NVS region need not be page-aligned (both ends) and we arrange - * things so that the data from page-aligned addresses in this region will - * be copied into separate RAM pages. - */ -int hibernate_nvs_register(unsigned long start, unsigned long size) -{ - struct nvs_page *entry, *next; - - while (size > 0) { - unsigned int nr_bytes; - - entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); - if (!entry) - goto Error; - - list_add_tail(&entry->node, &nvs_list); - entry->phys_start = start; - nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); - entry->size = (size < nr_bytes) ? size : nr_bytes; - - start += entry->size; - size -= entry->size; - } - return 0; - - Error: - list_for_each_entry_safe(entry, next, &nvs_list, node) { - list_del(&entry->node); - kfree(entry); - } - return -ENOMEM; -} - -/** - * hibernate_nvs_free - free data pages allocated for saving NVS regions - */ -void hibernate_nvs_free(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - free_page((unsigned long)entry->data); - entry->data = NULL; - if (entry->kaddr) { - iounmap(entry->kaddr); - entry->kaddr = NULL; - } - } -} - -/** - * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions - */ -int hibernate_nvs_alloc(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) { - entry->data = (void *)__get_free_page(GFP_KERNEL); - if (!entry->data) { - hibernate_nvs_free(); - return -ENOMEM; - } - } - return 0; -} - -/** - * hibernate_nvs_save - save NVS memory regions - */ -void hibernate_nvs_save(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Saving platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - entry->kaddr = ioremap(entry->phys_start, entry->size); - memcpy(entry->data, entry->kaddr, entry->size); - } -} - -/** - * hibernate_nvs_restore - restore NVS memory regions - * - * This function is going to be called with interrupts disabled, so it - * cannot iounmap the virtual addresses used to access the NVS region. - */ -void hibernate_nvs_restore(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Restoring platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) - memcpy(entry->kaddr, entry->data, entry->size); -} -- cgit v1.2.3