-rw-r--r--  include/linux/perf_counter.h    24
-rw-r--r--  kernel/exit.c                     7
-rw-r--r--  kernel/perf_counter.c           248
3 files changed, 228 insertions, 51 deletions
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 72460289c65..e5d25bf8f74 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -75,10 +75,11 @@ struct perf_counter_hw_event {
u64 irq_period;
u32 record_type;
- u32 disabled : 1, /* off by default */
- nmi : 1, /* NMI sampling */
- raw : 1, /* raw event type */
- __reserved_1 : 29;
+ u32 disabled : 1, /* off by default */
+ nmi : 1, /* NMI sampling */
+ raw : 1, /* raw event type */
+ inherit : 1, /* children inherit it */
+ __reserved_1 : 28;
u64 __reserved_2;
};
@@ -138,6 +139,8 @@ enum perf_counter_active_state {
PERF_COUNTER_STATE_ACTIVE = 1,
};
+struct file;
+
/**
* struct perf_counter - performance counter kernel representation:
*/
@@ -156,7 +159,10 @@ struct perf_counter {
struct perf_counter_context *ctx;
struct task_struct *task;
+ struct file *filp;
+ unsigned int nr_inherited;
+ struct perf_counter *parent;
/*
* Protect attach/detach:
*/
@@ -210,13 +216,16 @@ struct perf_cpu_context {
extern int perf_max_counters;
#ifdef CONFIG_PERF_COUNTERS
+extern void
+perf_counter_show(struct perf_counter *counter, char *str, int trace);
extern const struct hw_perf_counter_ops *
hw_perf_counter_init(struct perf_counter *counter);
extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
extern void perf_counter_task_tick(struct task_struct *task, int cpu);
-extern void perf_counter_init_task(struct task_struct *task);
+extern void perf_counter_init_task(struct task_struct *child);
+extern void perf_counter_exit_task(struct task_struct *child);
extern void perf_counter_notify(struct pt_regs *regs);
extern void perf_counter_print_debug(void);
extern u64 hw_perf_save_disable(void);
@@ -226,12 +235,15 @@ extern int perf_counter_task_enable(void);
#else
static inline void
+perf_counter_show(struct perf_counter *counter, char *str, int trace) { }
+static inline void
perf_counter_task_sched_in(struct task_struct *task, int cpu) { }
static inline void
perf_counter_task_sched_out(struct task_struct *task, int cpu) { }
static inline void
perf_counter_task_tick(struct task_struct *task, int cpu) { }
-static inline void perf_counter_init_task(struct task_struct *task) { }
+static inline void perf_counter_init_task(struct task_struct *child) { }
+static inline void perf_counter_exit_task(struct task_struct *child) { }
static inline void perf_counter_notify(struct pt_regs *regs) { }
static inline void perf_counter_print_debug(void) { }
static inline void hw_perf_restore(u64 ctrl) { }
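
Usage illustration of the new `inherit` bit (not part of the patch): a task opens a counter on itself with hw_event.inherit set, forks a child that does some work, and after the child exits (via the new perf_counter_exit_task() path) the child's counts have been folded back into the parent counter, so a single read() on the counter fd returns the combined value. This is a minimal sketch, assuming a raw-syscall wrapper for sys_perf_counter_open() with a (hw_event, pid, cpu, group_fd) argument order, a __NR_perf_counter_open syscall number, and a PERF_COUNT_INSTRUCTIONS event selector; those details are illustrative assumptions, only the inherit semantics and the struct layout come from this patch.

/*
 * Hypothetical userspace sketch, not part of this patch.  Assumes the
 * kernel header is visible to userspace and that __NR_perf_counter_open
 * is defined for this kernel; both are illustrative assumptions.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

static int perf_counter_open(struct perf_counter_hw_event *hw_event,
			     pid_t pid, int cpu, int group_fd)
{
	return syscall(__NR_perf_counter_open, hw_event, pid, cpu, group_fd);
}

int main(void)
{
	struct perf_counter_hw_event hw_event;
	unsigned long long count = 0;
	int fd;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.type    = PERF_COUNT_INSTRUCTIONS;	/* hardware event */
	hw_event.inherit = 1;				/* children inherit it */

	fd = perf_counter_open(&hw_event, 0 /* current task */, -1, -1 /* no group */);
	if (fd < 0)
		return 1;

	if (fork() == 0) {
		/*
		 * child: runs with an inherited, private counter; its counts
		 * are added back to the parent counter in do_exit()
		 */
		_exit(0);
	}
	wait(NULL);

	/* parent + exited children, folded together by the kernel */
	read(fd, &count, sizeof(count));
	printf("instructions: %llu\n", count);
	return 0;
}
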
diff --git a/kernel/exit.c b/kernel/exit.c
index 2d8be7ebb0f..d336c90a5f1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1093,11 +1093,12 @@ NORET_TYPE void do_exit(long code)
mpol_put(tsk->mempolicy);
tsk->mempolicy = NULL;
#endif
-#ifdef CONFIG_FUTEX
/*
- * This must happen late, after the PID is not
- * hashed anymore:
+ * These must happen late, after the PID is not
+ * hashed anymore, but still at a point that may sleep:
*/
+ perf_counter_exit_task(tsk);
+#ifdef CONFIG_FUTEX
if (unlikely(!list_empty(&tsk->pi_state_list)))
exit_pi_state_list(tsk);
if (unlikely(current->pi_state_cache))
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 416861ce8b2..f5e81dd193d 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -80,8 +80,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
list_del_init(&sibling->list_entry);
list_add_tail(&sibling->list_entry, &ctx->counter_list);
- WARN_ON_ONCE(!sibling->group_leader);
- WARN_ON_ONCE(sibling->group_leader == sibling);
sibling->group_leader = sibling;
}
}
@@ -97,6 +95,7 @@ static void __perf_counter_remove_from_context(void *info)
struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
struct perf_counter *counter = info;
struct perf_counter_context *ctx = counter->ctx;
+ unsigned long flags;
u64 perf_flags;
/*
@@ -107,7 +106,7 @@ static void __perf_counter_remove_from_context(void *info)
if (ctx->task && cpuctx->task_ctx != ctx)
return;
- spin_lock(&ctx->lock);
+ spin_lock_irqsave(&ctx->lock, flags);
if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
counter->hw_ops->hw_perf_counter_disable(counter);
@@ -136,7 +135,7 @@ static void __perf_counter_remove_from_context(void *info)
perf_max_counters - perf_reserved_percpu);
}
- spin_unlock(&ctx->lock);
+ spin_unlock_irqrestore(&ctx->lock, flags);
}
@@ -199,6 +198,7 @@ static void __perf_install_in_context(void *info)
struct perf_counter *counter = info;
struct perf_counter_context *ctx = counter->ctx;
int cpu = smp_processor_id();
+ unsigned long flags;
u64 perf_flags;
/*
@@ -209,7 +209,7 @@ static void __perf_install_in_context(void *info)
if (ctx->task && cpuctx->task_ctx != ctx)
return;
- spin_lock(&ctx->lock);
+ spin_lock_irqsave(&ctx->lock, flags);
/*
* Protect the list operation against NMI by disabling the
@@ -232,7 +232,7 @@ static void __perf_install_in_context(void *info)
if (!ctx->task && cpuctx->max_pertask)
cpuctx->max_pertask--;
- spin_unlock(&ctx->lock);
+ spin_unlock_irqrestore(&ctx->lock, flags);
}
/*
@@ -446,10 +446,9 @@ int perf_counter_task_disable(void)
*/
perf_flags = hw_perf_save_disable();
- list_for_each_entry(counter, &ctx->counter_list, list_entry) {
- WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE);
+ list_for_each_entry(counter, &ctx->counter_list, list_entry)
counter->state = PERF_COUNTER_STATE_OFF;
- }
+
hw_perf_restore(perf_flags);
spin_unlock(&ctx->lock);
@@ -526,26 +525,6 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
}
/*
- * Initialize the perf_counter context in a task_struct:
- */
-static void
-__perf_counter_init_context(struct perf_counter_context *ctx,
- struct task_struct *task)
-{
- spin_lock_init(&ctx->lock);
- INIT_LIST_HEAD(&ctx->counter_list);
- ctx->nr_counters = 0;
- ctx->task = task;
-}
-/*
- * Initialize the perf_counter context in task_struct
- */
-void perf_counter_init_task(struct task_struct *task)
-{
- __perf_counter_init_context(&task->perf_counter_ctx, task);
-}
-
-/*
* Cross CPU call to read the hardware counter
*/
static void __hw_perf_counter_read(void *info)
@@ -663,7 +642,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
cpuctx = &per_cpu(perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
- WARN_ON_ONCE(ctx->task);
return ctx;
}
@@ -915,12 +893,13 @@ sw_perf_counter_init(struct perf_counter *counter)
static struct perf_counter *
perf_counter_alloc(struct perf_counter_hw_event *hw_event,
int cpu,
- struct perf_counter *group_leader)
+ struct perf_counter *group_leader,
+ gfp_t gfpflags)
{
const struct hw_perf_counter_ops *hw_ops;
struct perf_counter *counter;
- counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+ counter = kzalloc(sizeof(*counter), gfpflags);
if (!counter)
return NULL;
@@ -947,9 +926,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
hw_ops = NULL;
if (!hw_event->raw && hw_event->type < 0)
hw_ops = sw_perf_counter_init(counter);
- if (!hw_ops) {
+ if (!hw_ops)
hw_ops = hw_perf_counter_init(counter);
- }
if (!hw_ops) {
kfree(counter);
@@ -975,8 +953,10 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
struct perf_counter *counter, *group_leader;
struct perf_counter_hw_event hw_event;
struct perf_counter_context *ctx;
+ struct file *counter_file = NULL;
struct file *group_file = NULL;
int fput_needed = 0;
+ int fput_needed2 = 0;
int ret;
if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
@@ -1017,25 +997,29 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
}
ret = -EINVAL;
- counter = perf_counter_alloc(&hw_event, cpu, group_leader);
+ counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
if (!counter)
goto err_put_context;
- perf_install_in_context(ctx, counter, cpu);
-
ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
if (ret < 0)
- goto err_remove_free_put_context;
+ goto err_free_put_context;
+
+ counter_file = fget_light(ret, &fput_needed2);
+ if (!counter_file)
+ goto err_free_put_context;
+
+ counter->filp = counter_file;
+ perf_install_in_context(ctx, counter, cpu);
+
+ fput_light(counter_file, fput_needed2);
out_fput:
fput_light(group_file, fput_needed);
return ret;
-err_remove_free_put_context:
- mutex_lock(&counter->mutex);
- perf_counter_remove_from_context(counter);
- mutex_unlock(&counter->mutex);
+err_free_put_context:
kfree(counter);
err_put_context:
@@ -1044,6 +1028,186 @@ err_put_context:
goto out_fput;
}
+/*
+ * Initialize the perf_counter context in a task_struct:
+ */
+static void
+__perf_counter_init_context(struct perf_counter_context *ctx,
+ struct task_struct *task)
+{
+ memset(ctx, 0, sizeof(*ctx));
+ spin_lock_init(&ctx->lock);
+ INIT_LIST_HEAD(&ctx->counter_list);
+ ctx->task = task;
+}
+
+/*
+ * inherit a counter from parent task to child task:
+ */
+static int
+inherit_counter(struct perf_counter *parent_counter,
+ struct task_struct *parent,
+ struct perf_counter_context *parent_ctx,
+ struct task_struct *child,
+ struct perf_counter_context *child_ctx)
+{
+ struct perf_counter *child_counter;
+
+ child_counter = perf_counter_alloc(&parent_counter->hw_event,
+ parent_counter->cpu, NULL,
+ GFP_ATOMIC);
+ if (!child_counter)
+ return -ENOMEM;
+
+ /*
+ * Link it up in the child's context:
+ */
+ child_counter->ctx = child_ctx;
+ child_counter->task = child;
+ list_add_counter(child_counter, child_ctx);
+ child_ctx->nr_counters++;
+
+ child_counter->parent = parent_counter;
+ parent_counter->nr_inherited++;
+ /*
+ * inherit into child's child as well:
+ */
+ child_counter->hw_event.inherit = 1;
+
+ /*
+ * Get a reference to the parent filp - we will fput it
+ * when the child counter exits. This is safe to do because
+ * we are in the parent and we know that the filp still
+ * exists and has a nonzero count:
+ */
+ atomic_long_inc(&parent_counter->filp->f_count);
+
+ return 0;
+}
+
+static void
+__perf_counter_exit_task(struct task_struct *child,
+ struct perf_counter *child_counter,
+ struct perf_counter_context *child_ctx)
+{
+ struct perf_counter *parent_counter;
+ u64 parent_val, child_val;
+ u64 perf_flags;
+
+ /*
+ * Disable and unlink this counter.
+ *
+ * Be careful about zapping the list - IRQ/NMI context
+ * could still be processing it:
+ */
+ local_irq_disable();
+ perf_flags = hw_perf_save_disable();
+
+ if (child_counter->state == PERF_COUNTER_STATE_ACTIVE)
+ child_counter->hw_ops->hw_perf_counter_disable(child_counter);
+ list_del_init(&child_counter->list_entry);
+
+ hw_perf_restore(perf_flags);
+ local_irq_enable();
+
+ parent_counter = child_counter->parent;
+ /*
+ * It can happen that parent exits first, and has counters
+ * that are still around due to the child reference. These
+ * counters need to be zapped - but otherwise linger.
+ */
+ if (!parent_counter)
+ return;
+
+ parent_val = atomic64_read(&parent_counter->count);
+ child_val = atomic64_read(&child_counter->count);
+
+ /*
+ * Add back the child's count to the parent's count:
+ */
+ atomic64_add(child_val, &parent_counter->count);
+
+ fput(parent_counter->filp);
+
+ kfree(child_counter);
+}
+
+/*
+ * When a child task exits, feed back counter values to parent counters.
+ *
+ * Note: we are running in child context, but the PID is not hashed
+ * anymore so new counters will not be added.
+ */
+void perf_counter_exit_task(struct task_struct *child)
+{
+ struct perf_counter *child_counter, *tmp;
+ struct perf_counter_context *child_ctx;
+
+ child_ctx = &child->perf_counter_ctx;
+
+ if (likely(!child_ctx->nr_counters))
+ return;
+
+ list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
+ list_entry)
+ __perf_counter_exit_task(child, child_counter, child_ctx);
+}
+
+/*
+ * Initialize the perf_counter context in task_struct
+ */
+void perf_counter_init_task(struct task_struct *child)
+{
+ struct perf_counter_context *child_ctx, *parent_ctx;
+ struct perf_counter *counter, *parent_counter;
+ struct task_struct *parent = current;
+ unsigned long flags;
+
+ child_ctx = &child->perf_counter_ctx;
+ parent_ctx = &parent->perf_counter_ctx;
+
+ __perf_counter_init_context(child_ctx, child);
+
+ /*
+ * This is executed from the parent task context, so inherit
+ * counters that have been marked for cloning:
+ */
+
+ if (likely(!parent_ctx->nr_counters))
+ return;
+
+ /*
+ * Lock the parent list. No need to lock the child - not PID
+ * hashed yet and not running, so nobody can access it.
+ */
+ spin_lock_irqsave(&parent_ctx->lock, flags);
+
+ /*
+ * We don't have to disable NMIs - we are only looking at
+ * the list, not manipulating it:
+ */
+ list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
+ if (!counter->hw_event.inherit || counter->group_leader != counter)
+ continue;
+
+ /*
+ * Instead of creating recursive hierarchies of counters,
+ * we link inherited counters back to the original parent,
+ * which has a filp for sure, which we use as the reference
+ * count:
+ */
+ parent_counter = counter;
+ if (counter->parent)
+ parent_counter = counter->parent;
+
+ if (inherit_counter(parent_counter, parent,
+ parent_ctx, child, child_ctx))
+ break;
+ }
+
+ spin_unlock_irqrestore(&parent_ctx->lock, flags);
+}
+
static void __cpuinit perf_counter_init_cpu(int cpu)
{
struct perf_cpu_context *cpuctx;